## Text Flappy Bird
*Romain Mondelice*

The goal of this assignment is to apply reinforcement learning methods to a
simple game called Text Flappy Bird (TFB). The game is a variation to the
well know Flappy Bird in which the player is made with a simple unit-element
character

## General imports

In [None]:
import os, sys
import gymnasium as gym
import time

import numpy as np
from collections import defaultdict
from tqdm import tqdm
import pickle

import text_flappy_bird_gym

## Preprocess env

In [None]:
# initiate environment
env = gym.make('TextFlappyBird-v0', height = 15, width = 20, pipe_gap = 4)
obs = env.reset()

In [None]:
total_reward = 0
while True:
        # Select next action
        action = env.action_space.sample()  # for an agent, action = agent.policy(observation)

        # Appy action and return new observation of the environment
        obs, reward, done, _, info = env.step(action)
        total_reward += reward

        # Render the game
        os.system("clear")
        sys.stdout.write(env.render())
        time.sleep(0.2) # FPS

        # If player is dead break
        if done:
            break

In [None]:
print(total_reward)
env.close()

## Monte Carlo based agent

In [124]:
from collections import defaultdict
import numpy as np
import random

class MonteCarloControlAgent:
    def __init__(self, env, gamma=0.9, epsilon=0.1):
        self.env = env
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration rate
        self.q_table = defaultdict(lambda: np.zeros(env.action_space.n))  # action-value table
        self.returns = defaultdict(list)  # store returns for each state-action pair
        self.policy = defaultdict(lambda: env.action_space.sample())  # initial policy is still random

    def encode_state(self, state):
        return str(state)

    def generate_episode(self):
        episode = []
        state = self.env.reset()
        done = False
        while not done:
            encoded_state = self.encode_state(state)
            action = self.epsilon_greedy_action(encoded_state)
            next_state, reward, done, _, info = self.env.step(action)
            episode.append((encoded_state, action, reward))
            state = next_state
        return episode

    def epsilon_greedy_action(self, encoded_state):
        if random.random() > self.epsilon:  # Exploit
            return np.argmax(self.q_table[encoded_state])
        else:  # Explore
            return self.env.action_space.sample()

    def update_q_values(self, episode):
        G = 0
        for encoded_state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            sa_pair = (encoded_state, action)
            if sa_pair not in [(x[0], x[1]) for x in episode[:-1]]:
                self.returns[sa_pair].append(G)
                self.q_table[encoded_state][action] = np.mean(self.returns[sa_pair])
    
    def improve_policy(self):
        for encoded_state in self.q_table:
            self.policy[encoded_state] = np.argmax(self.q_table[encoded_state])

In [128]:
def train_monte_carlo_agent(env_str, episodes=1000, gamma=0.9, epsilon=0.1):
    env = gym.make(env_str, height=15, width=20, pipe_gap=4)
    agent = MonteCarloControlAgent(env, gamma=gamma, epsilon=epsilon)

    for episode in tqdm(range(episodes), desc="Training process"):
        episode_data = agent.generate_episode()
        agent.update_q_values(episode_data)  # Use the correct method to update Q-values
        agent.improve_policy()
    
    print("Training completed.")
    return agent

In [129]:
def test_agent(agent, episodes=100):
    total_rewards = 0
    for _ in range(episodes):
        state = agent.env.reset()
        done = False
        while not done:
            encoded_state = agent.encode_state(state) 
            action = agent.policy[encoded_state]
            state, reward, done, _, info = agent.env.step(action)
            total_rewards += reward
            
    print("Total reward: ", total_rewards)
    avg_reward = total_rewards / episodes
    print(f"Average Reward over {episodes} episodes: {avg_reward}")

### Text Flappy Bird Screen env

#### Training

In [130]:
trained_agent = train_monte_carlo_agent('TextFlappyBird-screen-v0', episodes=1000)

NameError: name 'MonteCarloControlAgent' is not defined

#### Testing

In [109]:
# Test the trained Monte Carlo agent
test_agent(trained_agent)

Total reward:  1249
Average Reward over 100 episodes: 12.49


#### Save agent

In [110]:
# Assuming `trained_agent` is your Monte Carlo agent that you have trained
with open('../agents/mc-agent-screen.pkl', 'wb') as f:
    pickle.dump(trained_agent, f)

### Text Flappy Bird env

#### Training

In [120]:
trained_agent = train_monte_carlo_agent('TextFlappyBird-v0', episodes=100000)

Training process:: 100%|██████████| 100000/100000 [00:51<00:00, 1938.98it/s]

Training completed.





#### Testing

In [122]:
test_agent(trained_agent)

Total reward:  1216
Average Reward over 100 episodes: 12.16


#### Save

In [123]:
# Assuming `trained_agent` is your Monte Carlo agent that you have trained
with open('../agents/mc-agent.pkl', 'wb') as f:
    pickle.dump(trained_agent, f)

## Sarsa based agent

In [None]:
class SarsaLambdaAgent:
    def __init__(self, env, lambda_):
        self.env = env
        self.lambda_ = lambda_
        # Initialize Q-values and eligibility traces
        # Define epsilon for epsilon-greedy policy
    
    def policy(self, observation):
        # Define epsilon-greedy policy here
        return action
    
    def update_q_values(self, state, action, reward, next_state, next_action):
        # Update Q-values using the Sarsa(λ) formula
    
    def update_eligibility_traces(self, state, action):
        # Update eligibility traces
