# My First Flappy Bird DQN

# Imports and device setup

In [2]:
import flappy_bird_gymnasium
import gymnasium as gym
import numpy as np
import os

import torch
from torch import nn
import torch.nn.functional as F
import yaml

from datetime import datetime, timedelta
import argparse
import itertools

from collections import deque
import random

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt



# For printing date and time
DATE_FORMAT = "%m-%d %H:%M:%S"

RUN_DIR = "runs"
os.makedirs(RUN_DIR, exist_ok=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

yaml_text = """
cartpole1:
    env_id: CartPole-v1
    replay_memory_size: 100000
    mini_batch_size: 32
    epsilon_init: 1
    epsilon_decay: 0.99995
    min_epsilon: 0.05
    network_sync_rate: 10
    learning_rate_a: 0.002
    discount_factor_g: 0.99
    stop_on_reward: 4000
    fc1_nodes: 64

flappybird1:
    env_id: FlappyBird-v0
    replay_memory_size: 100000
    mini_batch_size: 64
    epsilon_init: 1.0
    epsilon_decay: 0.99995
    min_epsilon: 0.05
    network_sync_rate: 10
    learning_rate_a: 0.00001
    discount_factor_g: 0.99
    stop_on_reward: 2000
    fc1_nodes: 512
    env_make_params:
        use_lidar: false
"""

all_hyperparameter_sets = yaml.safe_load(yaml_text)

# Classes

In [3]:
# DQN Network
class DQN(nn.Module):
    
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(DQN, self).__init__()

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)



# Replay Memory
class ReplayMemory:
    def __init__(self, maxlen, seed=None):
        self.memory = deque([], maxlen=maxlen)

        if seed is not None:
            random.seed(seed)

    def append(self, transition):
        self.memory.append(transition)

    def sample(self, sample_size):
        return random.sample(self.memory, sample_size)

    def __len__(self):
        return len(self.memory)



# Agent
class Agent:

    def __init__(self, hyperparameter_set):
        hyperparameter = all_hyperparameter_sets[hyperparameter_set]

        # Attributes
        self.hyperparameter_set = hyperparameter_set
        self.env_id = hyperparameter['env_id']
        self.replay_memory_size = hyperparameter['replay_memory_size']
        self.mini_batch_size = hyperparameter['mini_batch_size']
        self.epsilon_init = hyperparameter['epsilon_init']
        self.epsilon_decay = hyperparameter['epsilon_decay']
        self.min_epsilon = hyperparameter['min_epsilon']
        self.network_sync_rate = hyperparameter['network_sync_rate']
        self.learning_rate_a = hyperparameter['learning_rate_a']
        self.discount_factor_g = hyperparameter['discount_factor_g']
        self.stop_on_reward = hyperparameter['stop_on_reward']
        self.fc1_nodes = hyperparameter['fc1_nodes']
        self.env_make_params = hyperparameter['env_make_params']

        self.loss_fn = nn.MSELoss()    # NN loss function using the MSE-Mean Squared Error
        self.optimizer = None    # Initialize optimizer to None for now

        self.rewards_per_episode = []
        self.epsilon_history = []

        # Path to Run info
        self.LOG_FILE = os.path.join(RUN_DIR, f'{self.hyperparameter_set}.log')
        self.MODEL_FILE = os.path.join(RUN_DIR, f'{self.hyperparameter_set}.pt')
        self.GRAPH_FILE = os.path.join(RUN_DIR, f'{self.hyperparameter_set}.png')

        self.policy_dqn = None
    
    def run (self, is_training=False, render=False):
        #env = gym.make('FlappyBird-v0', render_mode='human' if render else None, use_lidar=False)
        if is_training:
            env = gym.make('FlappyBird-v0', render_mode='human' if render else None, use_lidar=self.env_make_params.get('use_lidar', False))
        else:
            env = gym.make('FlappyBird-v0', render_mode='human' if render else None, use_lidar=False)

        num_states = env.observation_space.shape[0]
        num_actions = env.action_space.n

        self.policy_dqn = DQN(num_states, num_actions, self.fc1_nodes).to(device)

        # Initialize ReplayMemory if training
        if is_training:
            memory = ReplayMemory(self.replay_memory_size)

            epsilon = self.epsilon_init

            target_dqn = DQN(num_states, num_actions, self.fc1_nodes).to(device)
            target_dqn.load_state_dict(self.policy_dqn.state_dict())

            step_count = 0    # Initialize a step counter

            # Optimizer using Adam to update the policy network parameters using defined learning rate
            self.optimizer = torch.optim.Adam(self.policy_dqn.parameters(), lr=self.learning_rate_a)

            # Track best reward for training
            best_reward = float('-inf')
        else:
            # Load learned policy
            self.policy_dqn.load_state_dict(torch.load(self.MODEL_FILE))

            # Switch model to evaluation mode
            self.policy_dqn.eval()

            epsilon = self.epsilon_init   # Keep epsilon initialized even after training

            # Track best reward for evaluation
            best_reward = float('-inf')


        # Run the game indefinitely
        for episode in itertools.count():     
            state, _ = env.reset()
            state = torch.tensor(state, dtype=torch.float, device=device)
            
            terminated = False
            episode_reward = 0.0
            
            while not terminated:                
                if is_training and random.random() < epsilon:
                    action = env.action_space.sample()
                    action = torch.tensor(action, dtype=torch.int64, device=device)
                else:
                    with torch.no_grad():
                        # Converts the 1D tensor into 2D and then convert it back to 1D, then output is the highest action index
                        action = self.policy_dqn(state.unsqueeze(dim=0)).squeeze().argmax()
    
                # Processing
                new_state, reward, terminated, _, info = env.step(action.item())

                # Accumulate reward
                episode_reward += reward

                # Convert new state and reward to tensors in device
                new_state = torch.tensor(new_state, dtype=torch.float, device=device)
                reward = torch.tensor(reward, dtype=torch.float, device=device)
    
                if is_training:
                    memory.append((state, action, new_state, reward, terminated))

                    step_count += 1    # Increment step counter
                
                # Move to new state
                state = new_state


            if is_training:
                if episode_reward > best_reward:
                    log_message = f"{datetime.now().strftime(DATE_FORMAT)}: New best reward: {episode_reward:0.1f} ({(episode_reward-best_reward)})"
                    print(log_message)
                    print(f'---Episode {episode}, Reward: {episode_reward: .1f}, Epsilon: {epsilon: .4f}')
                    with open(self.LOG_FILE, 'a') as file:
                        file.write(log_message + '\n')

                    torch.save(self.policy_dqn.state_dict(), self.MODEL_FILE)
                    best_reward = episode_reward

            if is_training:
                epsilon = max(epsilon * self.epsilon_decay, self.min_epsilon)

            self.rewards_per_episode.append(episode_reward)
            self.epsilon_history.append(epsilon)

            if is_training and len(memory) > self.mini_batch_size:
                
                mini_batch = memory.sample(self.mini_batch_size)    # Sample from memory
                
                self.optimize(mini_batch, self.policy_dqn, target_dqn)

                # Update the policy target with the policy network after a certain number of steps
                if step_count >= self.network_sync_rate:
                    target_dqn.load_state_dict(self.policy_dqn.state_dict())
                    step_count = 0

            # Stop training when reward is more or equal to stopping reward
            if is_training and best_reward >= self.stop_on_reward:
                print(f"Best reward reached: {best_reward}. Stopping training.")
                break

            if not is_training:
                break

    
    def save_graph(self, rewards_per_episode, epsilon_history):
        # Save plots
        fig = plt.figure(1)

        # Plot average rewards (Y-axis) vs episodes (X-axis)
        mean_rewards = np.zeros(len(rewards_per_episode))
        for x in range(len(mean_rewards)):
            mean_rewards[x] = np.mean(rewards_per_episode[max(0, x-99):(x+1)])
        plt.subplot(121) # plot on a 1 row x 2 col grid, at cell 1
        # plt.xlabel('Episodes')
        plt.ylabel('Mean Rewards')
        plt.plot(mean_rewards)

        # Plot epsilon decay (Y-axis) vs episodes (X-axis)
        plt.subplot(122) # plot on a 1 row x 2 col grid, at cell 2
        # plt.xlabel('Time Steps')
        plt.ylabel('Epsilon Decay')
        plt.plot(epsilon_history)

        plt.subplots_adjust(wspace=1.0, hspace=1.0)

        # Save plots
        fig.savefig(self.GRAPH_FILE)
        plt.close(fig)
        
                    
    def optimize(self, mini_batch, policy_dqn, target_dqn):
        states, actions, new_states, rewards, terminations = zip(*mini_batch)    # Transpose the list of exp and separate each element

        states = torch.stack(states)    # Stack tensors into batch tensors, ([[1, 4], [2, 4]])

        actions = torch.stack(actions)

        new_states = torch.stack(new_states)

        rewards = torch.stack(rewards)
        terminations = torch.tensor(terminations).float().to(device)    # Convert terminations to tensor and float, to GPU

        # Calculate the Q-values with Bellman and no gradient updates
        with torch.no_grad():
            target_q = rewards + (1 - terminations) * self.discount_factor_g * target_dqn(new_states).max(dim=1)[0]
        
        current_q = policy_dqn(states).gather(dim=1, index=actions.unsqueeze(dim=1)).squeeze()    # Take actions and return q-values

        loss = self.loss_fn(current_q, target_q)    # Get the loss from current and target q-values

        # Optimize the model
        self.optimizer.zero_grad()    # Remove the previous gradients from the previous step
        loss.backward()    # Get the weights
        self.optimizer.step()    # Reduce the loss so the next current_q is closer to the target_q

# Create the agent
agent = Agent('flappybird1')

# To train the agent, uncomment this out if you want to re-train the whole model
# agent.run(is_training=True)

# To test the trained agent
agent.policy_dqn.load_state_dict(torch.load(agent.MODEL_FILE))
agent.policy_dqn.eval()  # Switch to eval mode
agent.run(is_training=False, render=True)

AttributeError: 'NoneType' object has no attribute 'load_state_dict'

# Main execution / testing

In [9]:
# To test the trained agent
agent.policy_dqn.load_state_dict(torch.load(agent.MODEL_FILE))
agent.policy_dqn.eval()  # Switch to eval mode
agent.run(is_training=False, render=True)

  agent.policy_dqn.load_state_dict(torch.load(agent.MODEL_FILE))
  self.policy_dqn.load_state_dict(torch.load(self.MODEL_FILE))


# Play Flappy Bird Game!

In [9]:
agent = Agent('flappybird1')

agent.run(is_training=False, render=True)

  self.policy_dqn.load_state_dict(torch.load(self.MODEL_FILE))
