In [1]:
import pygame
import utils.state as game_state
import utils.static as game_static
from utils.ModifiedTensorBoard import ModifiedTensorBoard
from utils.PositionalController import PositionalController
from game import AbstractGame

import time
from tqdm import tqdm

import numpy as np
import tensorflow as tf

pygame 2.6.1 (SDL 2.28.4, Python 3.10.14)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
import utils.state as game_state
import utils.static as game_static
from utils.ModifiedTensorBoard import ModifiedTensorBoard
import pygame



from collections import deque
import time
from tqdm import tqdm
import random


import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam




# --- Replay buffer / target network hyperparameters used by PositionalController ---

# Capacity of the replay buffer: only this many most recent transitions are kept.
REPLAY_MEMORY_SIZE = 8000

# Training is skipped until the buffer holds at least this many transitions.
MIN_REPLAY_MEMORY_SIZE = 500

# Number of transitions drawn from the buffer for every training step.
MINIBATCH_SIZE = 64

# Threshold, counted in terminal training calls (episode ends), that the
# target-update counter has to exceed before the target network is synced.
UPDATE_TARGET_EVERY = 10



class PositionalController:
    """Deep Q-learning agent driven by positional features of the game state.

    The agent owns two identically built dense networks: ``model`` (trained on
    every call to :meth:`train`) and ``target_model`` (used for the bootstrapped
    part of the Q-target and periodically synchronised with ``model``), plus a
    bounded replay buffer and a TensorBoard callback.
    """

    def __init__(self, model_name, lr, discount, number_of_states, number_of_actions) -> None:
        """Build both networks, the replay buffer and the TensorBoard logger.

        Args:
            model_name: Label used in the TensorBoard log directory name.
            lr: Learning rate handed to the Adam optimizer.
            discount: Discount factor applied to the bootstrapped future Q-value.
            number_of_states: Width of the network input layer.
            number_of_actions: Width of the network output layer.
        """
        self.lr = lr
        self.discount = discount
        self.number_of_states = number_of_states
        self.number_of_actions = number_of_actions

        # Online network: the one that is fitted and used for action selection.
        self.model = self.create_model()

        # Target network: starts as an exact copy of the online network.
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())
        # Number of terminal training calls seen since the last target sync.
        self.target_update_counter = 0

        # Most recent transitions, oldest dropped first once the buffer is full.
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format(model_name, int(time.time())))

    def create_model(self):
        """Return a compiled network: three 128-unit ReLU layers and a linear Q-value head."""
        model = Sequential([
            Dense(128, input_shape=(self.number_of_states,), activation='relu'),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(self.number_of_actions, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))
        return model

    @staticmethod
    def _normalized_distance(first, second):
        """Euclidean distance between two objects exposing ``x``/``y``, divided by the game diagonal."""
        return np.sqrt(
            (first.x - second.x)**2
            + (first.y - second.y)**2
        ) / game_static.GAME_DIAGONAL

    def get_features(self, state):
        """Extract the feature groups describing ``state``.

        Sizes are divided by ``game_static.MAX_ENTITY_SIZE``, positions by the
        game width/height and distances by the game diagonal; velocities,
        friction and the acceleration factor are passed through unchanged.

        Returns:
            A tuple ``(player_info, goal_info, enemy_info)`` where
            ``player_info`` has 8 values, ``goal_info`` has 5 values (the last
            one is the player-goal distance) and ``enemy_info`` is a list with
            one 7-value list per enemy (the last value is the player-enemy
            distance).
        """
        player = state.player_entity
        goal = state.goal_entity

        player_info = [
            player.entity.height/game_static.MAX_ENTITY_SIZE,
            player.entity.width/game_static.MAX_ENTITY_SIZE,
            player.friction,
            player.acc_factor,

            player.entity.x/game_static.GAME_WIDTH,
            player.entity.y/game_static.GAME_HEIGHT,
            player.velocity.x,
            player.velocity.y
        ]
        goal_info = [
            goal.x/game_static.GAME_WIDTH,
            goal.y/game_static.GAME_HEIGHT,
            goal.height/game_static.MAX_ENTITY_SIZE,
            goal.width/game_static.MAX_ENTITY_SIZE,
            # distance to goal
            self._normalized_distance(player.entity, goal),
        ]
        enemy_info = [
            [
                enemy_entity.entity.x/game_static.GAME_WIDTH,
                enemy_entity.entity.y/game_static.GAME_HEIGHT,
                enemy_entity.entity.height/game_static.MAX_ENTITY_SIZE,
                enemy_entity.entity.width/game_static.MAX_ENTITY_SIZE,
                enemy_entity.velocity.x,
                enemy_entity.velocity.y,
                # distance from the player to the enemy
                self._normalized_distance(player.entity, enemy_entity.entity),
            ]
            for enemy_entity in state.enemy_collection
        ]
        return player_info, goal_info, enemy_info

    def state_to_vector(self, state):
        """Flatten the features of ``state`` into a 1-D float32 array (player, goal, then enemies)."""
        player_info, goal_info, enemy_info = self.get_features(state)
        return np.asarray(
            [float(f) for feature_list in (player_info, goal_info, *enemy_info) for f in feature_list],
            dtype=np.float32,
        )

    def get_action(self, state):
        """Return the ``GameActions`` member with the highest predicted Q-value for ``state``."""
        network_input = self.state_to_vector(state).reshape(-1, self.number_of_states)
        q_values = self.model.predict(network_input, verbose=0)[0]
        return game_state.GameActions(np.argmax(q_values))

    def update_replay_memory(self, transition):
        """Append one transition to the replay buffer.

        ``transition`` is ``(observation, action, reward, new observation, done)``.
        """
        self.replay_memory.append(transition)

    def train(self, done):
        """Fit the online network on one random minibatch from the replay buffer.

        Intended to be called once per environment step.

        Args:
            done: Whether the step that was just played ended the episode. Only
                used to count finished episodes for the target-network sync.
        """
        # Wait until the buffer is warm (and large enough to sample a minibatch from).
        if len(self.replay_memory) < max(MIN_REPLAY_MEMORY_SIZE, MINIBATCH_SIZE):
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Q-values of the sampled states (online network) ...
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states, verbose=0)

        # ... and of their successors (target network).
        new_states = np.array([transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_states, verbose=0)

        targets = []
        # The per-transition terminal flag gets its own name: reusing ``done``
        # here would overwrite the argument that is checked after fitting.
        for index, (_, action, reward, _, transition_done) in enumerate(minibatch):
            if not transition_done:
                # Non-terminal: bootstrap from the best future Q-value.
                new_q = reward + self.discount * np.max(future_qs_list[index])
            else:
                # Terminal: there is no future, the target is the reward alone.
                new_q = reward

            # Only the Q-value of the action actually taken is moved.
            current_qs = current_qs_list[index]
            current_qs[action] = new_q
            targets.append(current_qs)

        # One gradient step on the whole minibatch.
        self.model.fit(
            current_states, np.array(targets),
            batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False,
            callbacks=[self.tensorboard])

        if done:  # the episode just ended
            self.target_update_counter += 1

        # Strict comparison: the sync happens once the counter exceeds
        # UPDATE_TARGET_EVERY, i.e. on the (UPDATE_TARGET_EVERY + 1)-th counted episode.
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0





In [3]:
# ---- Run configuration: PC1 ----

# Identity of the run and basic learning settings
MODEL_NAME = 'PC1'
EPISODES = 200
LR = 1e-3
DISCOUNT = 0.99
MIN_REWARD = -50  # a stats window whose best episode reaches this reward triggers a model save

# Width of the network input; it has to match the number of values produced by
# PositionalController.get_features once flattened.
NUMBER_OF_STATES = sum((
    8,                          # player: height, width, friction, acceleration factor, x, y, velocity x/y
    4,                          # goal: x, y, height, width
    1,                          # player-goal distance
    6 * game_static.ENEMY_COUNT,  # per enemy: x, y, height, width, velocity x/y
    game_static.ENEMY_COUNT,      # per enemy: distance from the player
))
NUMBER_OF_ACTIONS = 9

# Epsilon-greedy exploration: start fully random, decay once per episode
epsilon = 1
EPSILON_DECAY = 0.98
MIN_EPSILON = 0.01

# TensorBoard statistics are aggregated over windows of this many episodes
AGGREGATE_STATS_EVERY = 10

agent = PositionalController(
    MODEL_NAME,
    LR,
    DISCOUNT,
    NUMBER_OF_STATES,
    NUMBER_OF_ACTIONS,
)

In [7]:
my_game = AbstractGame(agent)

# Reward shaping and early-stop settings for this training run
GOAL_REWARD = 50              # added on a step whose observation value is 1
ENEMY_HIT_PENALTY = -100      # added on a step whose observation value is -1
TIME_PENALTY = -0.01          # constant cost of every step
NO_PROGRESS_STEP_LIMIT = 250  # an episode without any goal is abandoned after this many steps
NO_GOAL_LOG_PENALTY = 100     # subtracted from the *logged* reward of an abandoned episode

ep_rewards = []
ep_goals = []
ep_steps = []

# NOTE: `epsilon` is decayed in place below, so re-running this cell without
# re-running the configuration cell continues from the already decayed value.
try:
    # Iterate over episodes
    for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

        # Update tensorboard step every episode
        agent.tensorboard.step = episode

        # Restarting episode - reset the per-episode counters
        episode_reward = 0
        episode_step = 0
        goals_scored = 0
        step_when_last_goal_scored = 0

        # Get the initial state and encode it *before* the game is stepped, so the
        # stored observation cannot be affected by a later in-place state update.
        my_game.reset()
        current_state = my_game.get_current_state()
        player_info, goal_info, enemy_info = agent.get_features(current_state)
        current_state_tensor = tf.convert_to_tensor([float(f) for feature_list in [player_info, goal_info, *enemy_info] for f in feature_list])

        # Play until the game reports the end of the episode
        while not my_game.done:
            episode_step += 1

            # Epsilon-greedy action selection
            if np.random.random() > epsilon:
                action_enum = agent.get_action(current_state)
            else:
                action_enum = game_state.GameActions(
                    np.random.randint(0, NUMBER_OF_ACTIONS)
                )

            # Apply the action and observe the resulting state (encoded once)
            my_game.update_frame(action_enum)
            new_state = my_game.get_current_state()
            player_info, goal_info, enemy_info = agent.get_features(new_state)
            new_state_tensor = tf.convert_to_tensor([float(f) for feature_list in [player_info, goal_info, *enemy_info] for f in feature_list])

            # Reward: goal bonus, enemy penalty, distance-to-goal shaping and a time cost
            is_goal_scored = new_state.current_observation.value == 1
            if is_goal_scored:
                goals_scored += 1
                step_when_last_goal_scored = episode_step
                goal_scored_reward = GOAL_REWARD
            else:
                goal_scored_reward = 0
            enemy_attacked_penalty = ENEMY_HIT_PENALTY if new_state.current_observation.value == -1 else 0
            goal_distance_penalty = -goal_info[-1]  # last goal feature is the player-goal distance
            time_penalty = TIME_PENALTY

            reward = goal_scored_reward + goal_distance_penalty + time_penalty + enemy_attacked_penalty

            episode_reward += reward

            # Update replay memory and train the model(s)
            agent.update_replay_memory((current_state_tensor, action_enum.value, reward, new_state_tensor, my_game.done))
            agent.train(my_game.done)

            # The observation just produced is the starting point of the next step
            current_state = new_state
            current_state_tensor = new_state_tensor

            # NOTE(review): outside a class body no private-name mangling takes place, so
            # this sets an attribute literally called `__innerState`; it would not reach a
            # `self.__innerState` defined inside AbstractGame. Kept unchanged - TODO confirm
            # whether it is needed at all.
            my_game.__innerState = current_state

            if episode_step - step_when_last_goal_scored > NO_PROGRESS_STEP_LIMIT and goals_scored == 0:
                # For tensorboard logging only, so that there are no falsely high rewards
                # with no goals scored; the penalty is never stored in the replay memory.
                episode_reward -= NO_GOAL_LOG_PENALTY
                # NOTE(review): an abandoned episode never calls agent.train(True), so it
                # does not advance the agent's target-update counter - confirm this is intended.
                break  # no progress, and hasn't scored a goal in a while
            # my_game.draw_frame()

        # Record the episode and log aggregated stats (every given number of episodes)
        ep_rewards.append(episode_reward)
        ep_goals.append(goals_scored)
        ep_steps.append(episode_step)
        if not episode % AGGREGATE_STATS_EVERY or episode == 1:
            recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
            recent_goals = ep_goals[-AGGREGATE_STATS_EVERY:]
            recent_steps = ep_steps[-AGGREGATE_STATS_EVERY:]

            average_reward = sum(recent_rewards)/len(recent_rewards)
            min_reward = min(recent_rewards)
            max_reward = max(recent_rewards)

            average_goals = sum(recent_goals)/len(recent_goals)
            min_goals = min(recent_goals)
            max_goals = max(recent_goals)

            average_steps = sum(recent_steps)/len(recent_steps)
            min_steps = min(recent_steps)
            max_steps = max(recent_steps)

            agent.tensorboard.update_stats(
                epsilon=epsilon, reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward,
                goals_avg=average_goals, goals_min=min_goals, goals_max=max_goals,
                steps_avg=average_steps, steps_min=min_steps, steps_max=max_steps
            )

            if max_reward >= MIN_REWARD: # save model when it's good enough
                agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

        # decay epsilon
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
finally:
    # Release pygame even when training is interrupted or fails part-way.
    pygame.quit()

 70%|######9   | 139/200 [27:39<18:55, 18.62s/episodes]

INFO:tensorflow:Assets written to: models/PC1___-38.80max_-127.75avg_-180.16min__1732539782.model\assets


INFO:tensorflow:Assets written to: models/PC1___-38.80max_-127.75avg_-180.16min__1732539782.model\assets
100%|##########| 200/200 [43:12<00:00, 12.96s/episodes]


In [None]:
# ---- Run configuration: PC2 ----

# Identity of the run and basic learning settings
MODEL_NAME = 'PC2'
EPISODES = 300
LR = 1e-3
DISCOUNT = 0.99
MIN_REWARD = -75  # a stats window whose best episode reaches this reward triggers a model save

# Width of the network input; it has to match the number of values produced by
# PositionalController.get_features once flattened.
NUMBER_OF_STATES = sum((
    8,                          # player: height, width, friction, acceleration factor, x, y, velocity x/y
    4,                          # goal: x, y, height, width
    1,                          # player-goal distance
    6 * game_static.ENEMY_COUNT,  # per enemy: x, y, height, width, velocity x/y
    game_static.ENEMY_COUNT,      # per enemy: distance from the player
))
NUMBER_OF_ACTIONS = 9

# Epsilon-greedy exploration: start fully random, decay once per episode
epsilon = 1
EPSILON_DECAY = 0.99
MIN_EPSILON = 0.01

# TensorBoard statistics are aggregated over windows of this many episodes
AGGREGATE_STATS_EVERY = 10

agent = PositionalController(
    MODEL_NAME,
    LR,
    DISCOUNT,
    NUMBER_OF_STATES,
    NUMBER_OF_ACTIONS,
)

In [None]:
my_game = AbstractGame(agent)

# Reward shaping and early-stop settings for this training run
GOAL_REWARD = 50              # added on a step whose observation value is 1
ENEMY_HIT_PENALTY = -100      # added on a step whose observation value is -1
TIME_PENALTY = -0.01          # constant cost of every step
NO_PROGRESS_STEP_LIMIT = 250  # an episode without any goal is abandoned after this many steps
NO_GOAL_LOG_PENALTY = 100     # subtracted from the *logged* reward of an abandoned episode

ep_rewards = []
ep_goals = []
ep_steps = []

# NOTE: `epsilon` is decayed in place below, so re-running this cell without
# re-running the configuration cell continues from the already decayed value.
try:
    # Iterate over episodes
    for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

        # Update tensorboard step every episode
        agent.tensorboard.step = episode

        # Restarting episode - reset the per-episode counters
        episode_reward = 0
        episode_step = 0
        goals_scored = 0
        step_when_last_goal_scored = 0

        # Get the initial state and encode it *before* the game is stepped, so the
        # stored observation cannot be affected by a later in-place state update.
        my_game.reset()
        current_state = my_game.get_current_state()
        player_info, goal_info, enemy_info = agent.get_features(current_state)
        current_state_tensor = tf.convert_to_tensor([float(f) for feature_list in [player_info, goal_info, *enemy_info] for f in feature_list])

        # Play until the game reports the end of the episode
        while not my_game.done:
            episode_step += 1

            # Epsilon-greedy action selection
            if np.random.random() > epsilon:
                action_enum = agent.get_action(current_state)
            else:
                action_enum = game_state.GameActions(
                    np.random.randint(0, NUMBER_OF_ACTIONS)
                )

            # Apply the action and observe the resulting state (encoded once)
            my_game.update_frame(action_enum)
            new_state = my_game.get_current_state()
            player_info, goal_info, enemy_info = agent.get_features(new_state)
            new_state_tensor = tf.convert_to_tensor([float(f) for feature_list in [player_info, goal_info, *enemy_info] for f in feature_list])

            # Reward: goal bonus, enemy penalty, distance-to-goal shaping and a time cost
            is_goal_scored = new_state.current_observation.value == 1
            if is_goal_scored:
                goals_scored += 1
                step_when_last_goal_scored = episode_step
                goal_scored_reward = GOAL_REWARD
            else:
                goal_scored_reward = 0
            enemy_attacked_penalty = ENEMY_HIT_PENALTY if new_state.current_observation.value == -1 else 0
            goal_distance_penalty = -goal_info[-1]  # last goal feature is the player-goal distance
            time_penalty = TIME_PENALTY

            reward = goal_scored_reward + goal_distance_penalty + time_penalty + enemy_attacked_penalty

            episode_reward += reward

            # Update replay memory and train the model(s)
            agent.update_replay_memory((current_state_tensor, action_enum.value, reward, new_state_tensor, my_game.done))
            agent.train(my_game.done)

            # The observation just produced is the starting point of the next step
            current_state = new_state
            current_state_tensor = new_state_tensor

            # NOTE(review): outside a class body no private-name mangling takes place, so
            # this sets an attribute literally called `__innerState`; it would not reach a
            # `self.__innerState` defined inside AbstractGame. Kept unchanged - TODO confirm
            # whether it is needed at all.
            my_game.__innerState = current_state

            if episode_step - step_when_last_goal_scored > NO_PROGRESS_STEP_LIMIT and goals_scored == 0:
                # For tensorboard logging only, so that there are no falsely high rewards
                # with no goals scored; the penalty is never stored in the replay memory.
                episode_reward -= NO_GOAL_LOG_PENALTY
                # NOTE(review): an abandoned episode never calls agent.train(True), so it
                # does not advance the agent's target-update counter - confirm this is intended.
                break  # no progress, and hasn't scored a goal in a while
            # my_game.draw_frame()

        # Record the episode and log aggregated stats (every given number of episodes)
        ep_rewards.append(episode_reward)
        ep_goals.append(goals_scored)
        ep_steps.append(episode_step)
        if not episode % AGGREGATE_STATS_EVERY or episode == 1:
            recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
            recent_goals = ep_goals[-AGGREGATE_STATS_EVERY:]
            recent_steps = ep_steps[-AGGREGATE_STATS_EVERY:]

            average_reward = sum(recent_rewards)/len(recent_rewards)
            min_reward = min(recent_rewards)
            max_reward = max(recent_rewards)

            average_goals = sum(recent_goals)/len(recent_goals)
            min_goals = min(recent_goals)
            max_goals = max(recent_goals)

            average_steps = sum(recent_steps)/len(recent_steps)
            min_steps = min(recent_steps)
            max_steps = max(recent_steps)

            agent.tensorboard.update_stats(
                epsilon=epsilon, reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward,
                goals_avg=average_goals, goals_min=min_goals, goals_max=max_goals,
                steps_avg=average_steps, steps_min=min_steps, steps_max=max_steps
            )

            if max_reward >= MIN_REWARD: # save model when it's good enough
                agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

        # decay epsilon
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
finally:
    # Release pygame even when training is interrupted or fails part-way.
    pygame.quit()

 30%|##9       | 89/300 [11:01<17:44,  5.05s/episodes]  

INFO:tensorflow:Assets written to: models/PC2___-60.61max_-118.46avg_-220.43min__1732545769.model\assets


INFO:tensorflow:Assets written to: models/PC2___-60.61max_-118.46avg_-220.43min__1732545769.model\assets
 36%|###6      | 109/300 [14:24<28:30,  8.95s/episodes]

INFO:tensorflow:Assets written to: models/PC2___-62.03max_-114.38avg_-176.37min__1732545968.model\assets


INFO:tensorflow:Assets written to: models/PC2___-62.03max_-114.38avg_-176.37min__1732545968.model\assets
 51%|#####1    | 154/300 [22:47<30:49, 12.67s/episodes]  