## Text Flappy Bird
**Romain Mondelice**

The goal of this assignment is to apply reinforcement learning methods to a
simple game called Text Flappy Bird (TFB). The game is a variation of the
well-known Flappy Bird in which the player is a simple unit-element
character.

---

## **General imports**

In [None]:
import os, sys
import gymnasium as gym
import time

import numpy as np
from collections import defaultdict
from tqdm import tqdm
import pickle
import random

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import itertools

import text_flappy_bird_gym

---

## **Monte Carlo based agent**

In [None]:
class OffPolicyMonteCarloAgent:
    """Tabular off-policy Monte Carlo control agent (weighted importance sampling).

    Episodes are generated with an epsilon-greedy behavior policy derived from
    ``Q``; a greedy target policy is improved from the returns observed in each
    episode. States are keyed by ``str(state)``.
    """

    def __init__(self, env, gamma=0.9, epsilon=0.1):
        """Store the environment and hyperparameters and create empty tables.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` (returning a
                5-tuple) and ``action_space`` with ``n`` and ``sample()``.
            gamma: Discount factor applied to future rewards.
            epsilon: Exploration rate of the epsilon-greedy behavior policy.
        """
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        # Q[state][action]: action-value estimate.
        self.Q = defaultdict(self.zero_action_value)
        # C[state][action]: cumulative importance-sampling weight.
        self.C = defaultdict(self.zero_action_value)
        # target_policy[state]: greedy action for every state updated so far.
        self.target_policy = defaultdict(int)

    def zero_action_value(self):
        """Return a zero vector with one entry per action (table default)."""
        return np.zeros(self.env.action_space.n)

    def generate_episode(self, policy):
        """Play one episode and return it as ``(state_key, action, reward)`` tuples.

        Args:
            policy: Mapping from state key to a vector of action probabilities.
                States missing from the mapping get a uniformly sampled action.

        Returns:
            List of ``(str_state, action, reward)`` tuples in time order.
        """
        episode = []
        # NOTE(review): with the Gymnasium API ``reset()`` returns an
        # ``(observation, info)`` pair, so the first key of every episode is the
        # string of that pair rather than of the observation alone. Left as-is
        # because other code in this file builds its keys the same way.
        state = self.env.reset()
        terminated = truncated = False
        # Stop on truncation as well as termination; otherwise an environment
        # wrapped with a time limit would never end the episode.
        while not (terminated or truncated):
            # Convert state to a string representation.
            str_state = str(state)

            if str_state in policy:
                action_probs = policy[str_state]
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                # Fallback if the state is not in the policy, use uniform random selection
                action = self.env.action_space.sample()

            next_state, reward, terminated, truncated, _ = self.env.step(action)
            episode.append((str_state, action, reward))
            state = next_state
        return episode

    def get_probs(self, Q_s, epsilon, nA):
        """Return epsilon-greedy action probabilities for one state.

        Every action gets ``epsilon / nA``; the greedy action (first argmax of
        ``Q_s``) gets ``1 - epsilon + epsilon / nA``.
        """
        policy_s = np.ones(nA) * epsilon / nA
        best_a = np.argmax(Q_s)
        policy_s[best_a] = 1 - epsilon + (epsilon / nA)
        return policy_s

    def update_Q(self, episode):
        """Update ``Q``, ``C`` and the greedy target policy from one episode.

        The episode is processed backwards. Processing stops at the first step
        whose action differs from the (freshly updated) greedy action.

        Args:
            episode: List of ``(state_key, action, reward)`` tuples.
        """
        n_actions = self.env.action_space.n
        G = 0.0  # discounted return from the current step onwards
        W = 1.0  # importance-sampling weight
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            self.C[state][action] += W
            self.Q[state][action] += (W / self.C[state][action]) * (G - self.Q[state][action])
            self.target_policy[state] = np.argmax(self.Q[state])

            if action != self.target_policy[state]:
                break
            # NOTE(review): the behavior probability is recomputed from the Q
            # values *after* this update, not from the policy that actually
            # generated the episode — confirm this approximation is intended.
            W = W / self.get_probs(self.Q[state], self.epsilon, n_actions)[action]

    def train(self, num_episodes):
        """Run ``num_episodes`` generate-then-update iterations."""
        for _ in range(num_episodes):
            episode = self.generate_episode(policy=self.create_behavior_policy(self.Q))
            self.update_Q(episode)

    def create_behavior_policy(self, Q):
        """Creates a behavior policy using ε-greedy approach based on Q.

        Returns:
            Dict mapping every state key in ``Q`` to its action probabilities.
        """
        n_actions = self.env.action_space.n
        return {
            state: self.get_probs(actions, self.epsilon, n_actions)
            for state, actions in Q.items()
        }

In [None]:
def train_monte_carlo_agent(env_str, episodes=1000, gamma=0.9, epsilon=0.1):
    """Train an off-policy Monte Carlo agent and log the return of every episode.

    Args:
        env_str: Environment id passed to ``gym.make``.
        episodes: Number of training episodes to run.
        gamma: Discount factor handed to the agent.
        epsilon: Exploration rate handed to the agent.

    Returns:
        ``(agent, episode_rewards)`` where ``episode_rewards[k]`` is the
        undiscounted sum of the rewards collected in episode ``k``.
    """
    # Training always uses the 15x20 layout with a pipe gap of 4.
    agent = OffPolicyMonteCarloAgent(
        gym.make(env_str, height=15, width=20, pipe_gap=4),
        gamma=gamma,
        epsilon=epsilon,
    )
    episode_rewards = []

    progress = tqdm(range(episodes), desc="Training process")
    for _ in progress:
        # Roll out one episode with the epsilon-greedy policy of the current Q,
        # then learn from that rollout.
        rollout = agent.generate_episode(agent.create_behavior_policy(agent.Q))
        agent.update_Q(rollout)

        # Third tuple field is the step reward.
        episode_rewards.append(sum(step_reward for _state, _action, step_reward in rollout))

    print("Training completed.")
    return agent, episode_rewards

In [None]:
def test_agent(agent, episodes=100, reward_threshold=10000):
    """Evaluate a Monte Carlo agent's greedy target policy.

    Args:
        agent: Agent exposing ``env`` and a ``target_policy`` mapping from
            state key (``str(state)``) to action.
        episodes: Number of evaluation episodes.
        reward_threshold: An episode is cut short once its accumulated reward
            exceeds this value, so a near-perfect agent cannot run forever.

    Returns:
        Average episode reward over ``episodes`` episodes.
    """
    total_rewards = 0

    for _ in tqdm(range(1, episodes + 1), desc="Testing episodes"):
        state = agent.env.reset()
        terminated = truncated = False
        episode_reward = 0

        # Stop on truncation as well as termination so that a time-limited
        # environment ends the episode.
        while not (terminated or truncated):
            # Convert state to a string representation for consistency.
            str_state = str(state)

            # Use the target_policy for action selection if this state has been seen.
            # Otherwise, select a random action.
            if str_state in agent.target_policy:
                action = agent.target_policy[str_state]
            else:
                action = agent.env.action_space.sample()

            state, reward, terminated, truncated, _ = agent.env.step(action)
            episode_reward += reward

            # Check if the reward threshold for this episode has been exceeded
            if episode_reward > reward_threshold:
                break

        total_rewards += episode_reward

    avg_reward = total_rewards / episodes
    print("Total reward across all episodes: ", total_rewards)
    print(f"Average Reward over {episodes} episodes: {avg_reward}")
    return avg_reward

### **Text Flappy Bird Screen env**

#### *Sensitivity analysis*
Each hyperparameter combination is trained for 5000 episodes and then evaluated over 100 test episodes; the goal is to measure how sensitive the agent is to each hyperparameter in order to find the best values.

In [None]:
# Hyperparameter grid for the sensitivity sweep.
gammas = [0.8, 0.85, 0.9, 0.95, 1.0]  # discount factors, one matrix row each
epsilons = [0.1, 0.2, 0.3, 0.4, 0.5]  # exploration rates, one matrix column each

# Score matrix, zero-initialised: entry [i, j] belongs to (gammas[i], epsilons[j]).
performance_metrics = np.zeros(shape=(len(gammas), len(epsilons)))

In [None]:
# Visit every (gamma, epsilon) cell of the grid, gamma varying slowest.
for (i, gamma), (j, epsilon) in itertools.product(enumerate(gammas), enumerate(epsilons)):
    trained_agent, episode_rewards = train_monte_carlo_agent(
        'TextFlappyBird-screen-v0',
        episodes=5000,
        gamma=gamma,
        epsilon=epsilon,
    )
    # test_agent returns the *average* reward over the test episodes.
    total_reward = test_agent(trained_agent, episodes=100)
    performance_metrics[i, j] = total_reward

In [None]:
# Heatmap of the sweep results: one row per gamma, one column per epsilon,
# each cell annotated with the score stored in performance_metrics.
plt.figure(figsize=(10, 8))
sns.heatmap(performance_metrics, xticklabels=epsilons, yticklabels=gammas, annot=True, cmap="YlGnBu")
plt.xlabel('Epsilon')
plt.ylabel('Gamma')
plt.title('Agent Performance for Different Gamma and Epsilon Values')

# Save the current figure; the target directory must already exist.
plot_file_path = "../reports/figures/sensitivity_analysis_mc_screen.png"
plt.savefig(plot_file_path)

#### *Best params training*

In [None]:
# Select the (gamma, epsilon) pair that scored best in the sensitivity sweep.
# argmax works on the flattened matrix; unravel_index maps it back to (row, column).
max_value_index = np.unravel_index(performance_metrics.argmax(), performance_metrics.shape)

# Rows of the matrix index gamma, columns index epsilon.
best_epsilon = epsilons[max_value_index[1]]
best_gamma = gammas[max_value_index[0]]

# Create the output folders before the long training run so that saving the
# results cannot fail on a missing directory afterwards.
os.makedirs('../saves', exist_ok=True)
os.makedirs('../agents', exist_ok=True)

# Train
trained_agent, episode_rewards = train_monte_carlo_agent('TextFlappyBird-screen-v0', episodes=25000, gamma=best_gamma, epsilon=best_epsilon)

# Save agent and episode reward history
episode_rewards_df = pd.DataFrame(episode_rewards, columns=["Reward"])
episode_rewards_df.to_csv('../saves/episode_rewards_mc_screen.csv', index_label="Episode")

with open('../agents/mc-agent-screen.pkl', 'wb') as f:
    pickle.dump(trained_agent, f)

In [None]:
# Smooth the per-episode rewards with a 500-episode moving average. With the
# default min_periods the first 499 values are NaN (window not yet full).
rolling_mean = episode_rewards_df['Reward'].rolling(window=500).mean()

# Plot the smoothed training curve, save it to the reports folder, then show it.
plt.figure(figsize=(10, 6))
sns.lineplot(data=rolling_mean, label='500 Episode Rolling Mean')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Reward Evolution During Training (500 Episode Rolling Mean)')
plt.legend()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/reward_evolution_rolling_mean_mc_screen.png')
plt.show()

### **Text Flappy Bird Dist Env**

#### *Sensitivity analysis*
Each hyperparameter combination is trained for 5000 episodes and then evaluated over 100 test episodes; the goal is to measure how sensitive the agent is to each hyperparameter in order to find the best values.

In [None]:
# Hyperparameter grid for the sensitivity sweep.
gammas = [0.8, 0.85, 0.9, 0.95, 1.0]  # discount factors, one matrix row each
epsilons = [0.1, 0.2, 0.3, 0.4, 0.5]  # exploration rates, one matrix column each

# Score matrix, zero-initialised: entry [i, j] belongs to (gammas[i], epsilons[j]).
performance_metrics = np.zeros(shape=(len(gammas), len(epsilons)))

In [None]:
# Visit every (gamma, epsilon) cell of the grid, gamma varying slowest.
for (i, gamma), (j, epsilon) in itertools.product(enumerate(gammas), enumerate(epsilons)):
    trained_agent, episode_rewards = train_monte_carlo_agent(
        'TextFlappyBird-v0',
        episodes=5000,
        gamma=gamma,
        epsilon=epsilon,
    )
    # test_agent returns the *average* reward over the test episodes.
    total_reward = test_agent(trained_agent, episodes=100)
    performance_metrics[i, j] = total_reward

In [None]:
# Heatmap of the sweep results: one row per gamma, one column per epsilon,
# each cell annotated with the score stored in performance_metrics.
plt.figure(figsize=(10, 8))
sns.heatmap(performance_metrics, xticklabels=epsilons, yticklabels=gammas, annot=True, cmap="YlGnBu")
plt.xlabel('Epsilon')
plt.ylabel('Gamma')
plt.title('Agent Performance for Different Gamma and Epsilon Values')

# Save the current figure; the target directory must already exist.
plot_file_path = "../reports/figures/sensitivity_analysis_mc_dist.png"
plt.savefig(plot_file_path)

#### *Best params training*

In [None]:
# Select the (gamma, epsilon) pair that scored best in the sensitivity sweep.
# argmax works on the flattened matrix; unravel_index maps it back to (row, column).
max_value_index = np.unravel_index(performance_metrics.argmax(), performance_metrics.shape)

# Rows of the matrix index gamma, columns index epsilon.
best_epsilon = epsilons[max_value_index[1]]
best_gamma = gammas[max_value_index[0]]

# Create the output folders before the long training run so that saving the
# results cannot fail on a missing directory afterwards.
os.makedirs('../saves', exist_ok=True)
os.makedirs('../agents', exist_ok=True)

# Train
trained_agent, episode_rewards = train_monte_carlo_agent('TextFlappyBird-v0', episodes=25000, gamma=best_gamma, epsilon=best_epsilon)

# Save agent and episode reward history
episode_rewards_df = pd.DataFrame(episode_rewards, columns=["Reward"])
episode_rewards_df.to_csv('../saves/episode_rewards_mc_dist.csv', index_label="Episode")

with open('../agents/mc-agent-dist.pkl', 'wb') as f:
    pickle.dump(trained_agent, f)

In [None]:
# Smooth the per-episode rewards with a 500-episode moving average. With the
# default min_periods the first 499 values are NaN (window not yet full).
rolling_mean = episode_rewards_df['Reward'].rolling(window=500).mean()

# Plot the smoothed training curve, save it to the reports folder, then show it.
plt.figure(figsize=(10, 6))
sns.lineplot(data=rolling_mean, label='500 Episode Rolling Mean')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Reward Evolution During Training (500 Episode Rolling Mean)')
plt.legend()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/reward_evolution_rolling_mean_mc_dist.png')
plt.show()

### **Test of trained agent on different level configuration (check overfitting)**

#### *Test on screen agent*

In [None]:
# Restore the Monte Carlo agent pickled by the screen-environment training cell.
with open('../agents/mc-agent-screen.pkl', 'rb') as f:
    trained_agent_screen = pickle.load(f)

# Level layouts to evaluate; the keys match the environment's keyword arguments.
configurations = [
    {'height': 15, 'width': 20, 'pipe_gap': 4},
    {'height': 20, 'width': 25, 'pipe_gap': 7},
    {'height': 25, 'width': 33, 'pipe_gap': 10}
]

# One average reward per configuration, in the same order as `configurations`.
average_rewards = []

for config in configurations:
    print(f"Testing on configuration: {config}")
    # Swap the agent's environment for one built with this layout.
    env = gym.make('TextFlappyBird-screen-v0', **config)
    trained_agent_screen.env = env
    avg_reward = test_agent(trained_agent_screen, episodes=100, reward_threshold=10000)
    average_rewards.append(avg_reward)
    print(f"Average reward for configuration {config}: {avg_reward}\n")

In [None]:
# Human-readable x-axis label for each tested configuration.
config_labels = [
    f"Height: {cfg['height']}, Width: {cfg['width']}, Pipe Gap: {cfg['pipe_gap']}"
    for cfg in configurations
]

In [None]:
# Bar chart: one bar per configuration showing its average test reward.
plt.figure(figsize=(10, 6))
plt.bar(config_labels, average_rewards)
plt.xlabel('Configuration')
plt.ylabel('Average Reward')
plt.title('Average Reward for Different Configurations')
# The labels are long, so rotate them and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/average_reward_histogram_mc_screen.png')
plt.show()

#### *Test on dist agent*

In [None]:
# Restore the Monte Carlo agent pickled by the distance-environment training cell.
with open('../agents/mc-agent-dist.pkl', 'rb') as f:
    trained_agent_dist = pickle.load(f)

# Level layouts to evaluate; the keys match the environment's keyword arguments.
configurations = [
    {'height': 15, 'width': 20, 'pipe_gap': 4},
    {'height': 20, 'width': 25, 'pipe_gap': 7},
    {'height': 25, 'width': 33, 'pipe_gap': 10}
]

# One average reward per configuration, in the same order as `configurations`.
average_rewards = []

for config in configurations:
    print(f"Testing on configuration: {config}")
    # Swap the agent's environment for one built with this layout.
    env = gym.make('TextFlappyBird-v0', **config)
    trained_agent_dist.env = env
    avg_reward = test_agent(trained_agent_dist, episodes=100, reward_threshold=10000)
    average_rewards.append(avg_reward)
    print(f"Average reward for configuration {config}: {avg_reward}\n")

In [None]:
# Human-readable x-axis label for each tested configuration.
config_labels = [
    f"Height: {cfg['height']}, Width: {cfg['width']}, Pipe Gap: {cfg['pipe_gap']}"
    for cfg in configurations
]

In [None]:
# Bar chart: one bar per configuration showing its average test reward.
plt.figure(figsize=(10, 6))
plt.bar(config_labels, average_rewards)
plt.xlabel('Configuration')
plt.ylabel('Average Reward')
plt.title('Average Reward for Different Configurations')
# The labels are long, so rotate them and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/average_reward_histogram_mc_dist.png')
plt.show()

---

## **Sarsa based agent**

In [None]:
class SarsaLambdaAgent:
    """Tabular SARSA(lambda) agent with accumulating eligibility traces.

    Action values and traces are stored per state key (callers pass
    ``str(state)``); actions are chosen epsilon-greedily from ``Q``.
    """

    def __init__(self, env, gamma=0.9, lambda_=0.9, epsilon=0.1, alpha=0.5):
        """Store the environment and hyperparameters and create empty tables.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` (returning a
                5-tuple) and ``action_space`` with ``n`` and ``sample()``.
            gamma: Discount factor.
            lambda_: Trace-decay parameter.
            epsilon: Probability of taking a random exploratory action.
            alpha: Learning rate.
        """
        self.env = env
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.alpha = alpha
        # Q[state][action]: action-value estimate.
        self.Q = defaultdict(self.zero_action_value)
        # E[state][action]: eligibility trace; only holds states visited since
        # the last call to reset_eligibility_traces().
        self.E = defaultdict(self.zero_action_value)

    def zero_action_value(self):
        """Return a zero vector with one entry per action (table default)."""
        return np.zeros(self.env.action_space.n)

    def choose_action(self, state):
        """Pick an action epsilon-greedily with respect to ``Q[state]``.

        Looking up an unseen state inserts a zero row into ``Q``.
        """
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def update(self, state, action, reward, next_state, next_action, done):
        """Apply one SARSA(lambda) backup for a single transition.

        Args:
            state: Key of the state the action was taken in.
            action: Action taken.
            reward: Reward received.
            next_state: Key of the resulting state.
            next_action: Action selected for ``next_state``.
            done: When true, the value of ``next_state`` is not bootstrapped.
        """
        delta = reward + self.gamma * self.Q[next_state][next_action] * (not done) - self.Q[state][action]
        self.E[state][action] += 1

        step = self.alpha * delta
        decay = self.gamma * self.lambda_
        # Only states with a trace entry can receive a non-zero update, so sweep
        # the trace table (states visited this episode) instead of every state
        # ever seen. The row-wise numpy operations update all actions at once.
        for s, trace in self.E.items():
            self.Q[s] += step * trace
            trace *= decay

    def reset_eligibility_traces(self):
        """Forget all traces; call at the start of every episode."""
        # Dropping the entries is equivalent to zeroing them (missing rows
        # default to zeros) and keeps update() proportional to episode length.
        self.E.clear()

    def train(self, num_episodes):
        """Run ``num_episodes`` on-policy training episodes."""
        for _ in range(num_episodes):
            # NOTE(review): with the Gymnasium API ``reset()`` returns an
            # ``(observation, info)`` pair, so the first key of every episode is
            # the string of that pair — left unchanged for consistency with the
            # rest of this file.
            state = self.env.reset()
            action = self.choose_action(str(state))
            self.reset_eligibility_traces()
            terminated = truncated = False

            # End the episode on truncation too, but bootstrap unless the
            # episode actually terminated.
            while not (terminated or truncated):
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_action = self.choose_action(str(next_state))
                self.update(str(state), action, reward, str(next_state), next_action, terminated)
                state = next_state
                action = next_action

In [None]:
def train_sarsa_lambda_agent(env_str, episodes=1000, gamma=0.9, lambda_=0.9, epsilon=0.1, alpha=0.5):
    """Train a SARSA(lambda) agent and log the return of every episode.

    Args:
        env_str: Environment id passed to ``gym.make``.
        episodes: Number of training episodes.
        gamma: Discount factor handed to the agent.
        lambda_: Trace-decay parameter handed to the agent.
        epsilon: Exploration rate handed to the agent.
        alpha: Learning rate handed to the agent.

    Returns:
        ``(agent, episode_rewards)`` where ``episode_rewards[k]`` is the
        undiscounted sum of the rewards collected in episode ``k``.
    """
    # Training always uses the 15x20 layout with a pipe gap of 4.
    env = gym.make(env_str, height=15, width=20, pipe_gap=4)
    agent = SarsaLambdaAgent(env, gamma=gamma, lambda_=lambda_, epsilon=epsilon, alpha=alpha)

    episode_rewards = []

    for _ in tqdm(range(episodes), desc="Training process"):
        state = env.reset()
        action = agent.choose_action(str(state))
        agent.reset_eligibility_traces()
        total_reward = 0
        terminated = truncated = False

        # Stop on truncation as well as termination so a time-limited
        # environment ends the episode.
        while not (terminated or truncated):
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_action = agent.choose_action(str(next_state))

            # Update Q-values based on the transition. Only a real termination
            # disables bootstrapping from the next state.
            agent.update(str(state), action, reward, str(next_state), next_action, terminated)

            state = next_state
            action = next_action
            total_reward += reward

        episode_rewards.append(total_reward)

    print("Training completed.")
    return agent, episode_rewards

In [None]:
def test_sarsa_lambda_agent(agent, episodes=100, reward_threshold=10000):
    """Evaluate a SARSA(lambda) agent using its own ``choose_action``.

    Args:
        agent: Agent exposing ``env`` and ``choose_action(state_key)``.
        episodes: Number of evaluation episodes.
        reward_threshold: An episode is cut short once its accumulated reward
            exceeds this value, so a near-perfect agent cannot run forever.

    Returns:
        Average episode reward over ``episodes`` episodes.
    """
    total_rewards = 0

    for _ in tqdm(range(1, episodes + 1), desc="Testing episodes"):
        state = agent.env.reset()
        terminated = truncated = False
        episode_reward = 0

        # Stop on truncation as well as termination so a time-limited
        # environment ends the episode.
        while not (terminated or truncated):
            # NOTE(review): choose_action is epsilon-greedy, so the evaluation
            # still explores with probability agent.epsilon — confirm that a
            # purely greedy evaluation is not what was intended.
            action = agent.choose_action(str(state))

            state, reward, terminated, truncated, _ = agent.env.step(action)
            episode_reward += reward

            # Check if the reward threshold for this episode has been exceeded
            if episode_reward > reward_threshold:
                break

        total_rewards += episode_reward

    avg_reward = total_rewards / episodes
    print("Total reward across all episodes: ", total_rewards)
    print(f"Average Reward over {episodes} episodes: {avg_reward}")
    return avg_reward

### **Text Flappy Bird Screen env**

#### *Sensitivity analysis*
Each hyperparameter combination is trained for 5000 episodes and then evaluated over 100 test episodes; the goal is to measure how sensitive the agent is to each hyperparameter in order to find the best values.

In [None]:
# Two candidate values per hyperparameter.
gammas = [0.8, 1.0]  # discount factor
epsilons = [0.1, 0.5]  # exploration rate
lambdas = [0.8, 1.0]  # trace decay
alphas = [0.1, 0.5]  # learning rate

# Full Cartesian grid as (gamma, epsilon, lambda, alpha) tuples; gamma varies
# slowest and alpha fastest.
param_combinations = [*itertools.product(gammas, epsilons, lambdas, alphas)]

In [None]:
# One [gamma, epsilon, lambda, alpha, score] row per hyperparameter combination.
performance_metrics = []

for gamma, epsilon, lambda_, alpha in param_combinations:
    trained_agent, episode_rewards = train_sarsa_lambda_agent(
        'TextFlappyBird-screen-v0',
        episodes=5000,
        gamma=gamma,
        lambda_=lambda_,
        epsilon=epsilon,
        alpha=alpha,
    )
    # test_sarsa_lambda_agent returns the *average* reward over the test episodes.
    total_reward = test_sarsa_lambda_agent(trained_agent, episodes=100)
    performance_metrics.append([gamma, epsilon, lambda_, alpha, total_reward])

In [None]:
# Tabulate the sweep: one row per hyperparameter combination. The
# 'Total Reward' column holds the value returned by test_sarsa_lambda_agent,
# which is the average reward over the test episodes.
df = pd.DataFrame(performance_metrics, columns=['Gamma', 'Epsilon', 'Lambda', 'Alpha', 'Total Reward'])

# Reshape to a 2-D grid for the heatmap: (Gamma, Lambda) pairs as rows and
# (Epsilon, Alpha) pairs as columns.
pivot_df = df.pivot_table(index=['Gamma', 'Lambda'], columns=['Epsilon', 'Alpha'], values='Total Reward')

# Draw the annotated heatmap and save it; the target directory must already exist.
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_df, annot=True, cmap="YlGnBu")
plt.xlabel('Epsilon - Alpha')
plt.ylabel('Gamma - Lambda')
plt.title('Agent Performance for Different Hyperparameter Combinations')
plot_file_path = "../reports/figures/sensitivity_analysis_sarsa_screen.png"
plt.savefig(plot_file_path)

#### *Best params training*

In [None]:
# Pick the hyperparameter combination with the highest test score.
# idxmax returns the index label of the best row; loc fetches that row.
best_params_row = df.loc[df['Total Reward'].idxmax()]

# Extract the best hyperparameters
best_gamma = best_params_row['Gamma']
best_epsilon = best_params_row['Epsilon']
best_lambda = best_params_row['Lambda']
best_alpha = best_params_row['Alpha']

# Create the output folders before the long training run so that saving the
# results cannot fail on a missing directory afterwards.
os.makedirs('../saves', exist_ok=True)
os.makedirs('../agents', exist_ok=True)

# Train
trained_agent, episode_rewards = train_sarsa_lambda_agent('TextFlappyBird-screen-v0', episodes=25000, gamma=best_gamma, epsilon=best_epsilon, lambda_=best_lambda, alpha=best_alpha)

# Save agent and episode reward history
episode_rewards_df = pd.DataFrame(episode_rewards, columns=["Reward"])
episode_rewards_df.to_csv('../saves/episode_rewards_sarsa_screen.csv', index_label="Episode")

with open('../agents/sarsa-agent-screen.pkl', 'wb') as f:
    pickle.dump(trained_agent, f)

In [None]:
# Smooth the per-episode rewards with a 500-episode moving average. With the
# default min_periods the first 499 values are NaN (window not yet full).
rolling_mean = episode_rewards_df['Reward'].rolling(window=500).mean()

# Plot the smoothed training curve, save it to the reports folder, then show it.
plt.figure(figsize=(10, 6))
sns.lineplot(data=rolling_mean, label='500 Episode Rolling Mean')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Reward Evolution During Training (500 Episode Rolling Mean)')
plt.legend()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/reward_evolution_rolling_mean_sarsa_screen.png')
plt.show()

### **Text Flappy Bird dist env**

#### *Sensitivity analysis*
Each hyperparameter combination is trained for 5000 episodes and then evaluated over 100 test episodes; the goal is to measure how sensitive the agent is to each hyperparameter in order to find the best values.

In [None]:
# Two candidate values per hyperparameter.
gammas = [0.8, 1.0]  # discount factor
epsilons = [0.1, 0.5]  # exploration rate
lambdas = [0.8, 1.0]  # trace decay
alphas = [0.1, 0.5]  # learning rate

# Full Cartesian grid as (gamma, epsilon, lambda, alpha) tuples; gamma varies
# slowest and alpha fastest.
param_combinations = [*itertools.product(gammas, epsilons, lambdas, alphas)]

In [None]:
# One [gamma, epsilon, lambda, alpha, score] row per hyperparameter combination.
performance_metrics = []

for gamma, epsilon, lambda_, alpha in param_combinations:
    trained_agent, episode_rewards = train_sarsa_lambda_agent(
        'TextFlappyBird-v0',
        episodes=5000,
        gamma=gamma,
        lambda_=lambda_,
        epsilon=epsilon,
        alpha=alpha,
    )
    # test_sarsa_lambda_agent returns the *average* reward over the test episodes.
    total_reward = test_sarsa_lambda_agent(trained_agent, episodes=100)
    performance_metrics.append([gamma, epsilon, lambda_, alpha, total_reward])

In [None]:
# Tabulate the sweep: one row per hyperparameter combination. The
# 'Total Reward' column holds the value returned by test_sarsa_lambda_agent,
# which is the average reward over the test episodes.
df = pd.DataFrame(performance_metrics, columns=['Gamma', 'Epsilon', 'Lambda', 'Alpha', 'Total Reward'])

# Reshape to a 2-D grid for the heatmap: (Gamma, Lambda) pairs as rows and
# (Epsilon, Alpha) pairs as columns.
pivot_df = df.pivot_table(index=['Gamma', 'Lambda'], columns=['Epsilon', 'Alpha'], values='Total Reward')

# Draw the annotated heatmap and save it; the target directory must already exist.
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_df, annot=True, cmap="YlGnBu")
plt.xlabel('Epsilon - Alpha')
plt.ylabel('Gamma - Lambda')
plt.title('Agent Performance for Different Hyperparameter Combinations')
plot_file_path = "../reports/figures/sensitivity_analysis_sarsa_dist.png"
plt.savefig(plot_file_path)

In [None]:
# Pick the hyperparameter combination with the highest test score.
# idxmax returns the index label of the best row; loc fetches that row.
best_params_row = df.loc[df['Total Reward'].idxmax()]

# Extract the best hyperparameters
best_gamma = best_params_row['Gamma']
best_epsilon = best_params_row['Epsilon']
best_lambda = best_params_row['Lambda']
best_alpha = best_params_row['Alpha']

# Create the output folders before the long training run so that saving the
# results cannot fail on a missing directory afterwards.
os.makedirs('../saves', exist_ok=True)
os.makedirs('../agents', exist_ok=True)

# Train
trained_agent, episode_rewards = train_sarsa_lambda_agent('TextFlappyBird-v0', episodes=25000, gamma=best_gamma, epsilon=best_epsilon, lambda_=best_lambda, alpha=best_alpha)

# Save agent and episode reward history
episode_rewards_df = pd.DataFrame(episode_rewards, columns=["Reward"])
episode_rewards_df.to_csv('../saves/episode_rewards_sarsa_dist.csv', index_label="Episode")

with open('../agents/sarsa-agent-dist.pkl', 'wb') as f:
    pickle.dump(trained_agent, f)

In [None]:
# Smooth the per-episode rewards with a 500-episode moving average. With the
# default min_periods the first 499 values are NaN (window not yet full).
rolling_mean = episode_rewards_df['Reward'].rolling(window=500).mean()

# Plot the smoothed training curve, save it to the reports folder, then show it.
plt.figure(figsize=(10, 6))
sns.lineplot(data=rolling_mean, label='500 Episode Rolling Mean')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Reward Evolution During Training (500 Episode Rolling Mean)')
plt.legend()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/reward_evolution_rolling_mean_sarsa_dist.png')
plt.show()

### **Test of trained agent on different level configuration (check overfitting)**

#### *Test on screen agent*

In [None]:
# Restore the SARSA(lambda) agent pickled by the screen-environment training cell.
with open('../agents/sarsa-agent-screen.pkl', 'rb') as f:
    trained_agent_screen = pickle.load(f)

# Level layouts to evaluate; the keys match the environment's keyword arguments.
configurations = [
    {'height': 15, 'width': 20, 'pipe_gap': 4},
    {'height': 20, 'width': 25, 'pipe_gap': 7},
    {'height': 25, 'width': 33, 'pipe_gap': 10}
]

# One average reward per configuration, in the same order as `configurations`.
average_rewards = []

for config in configurations:
    print(f"Testing on configuration: {config}")
    # Swap the agent's environment for one built with this layout.
    env = gym.make('TextFlappyBird-screen-v0', **config)
    trained_agent_screen.env = env
    avg_reward = test_sarsa_lambda_agent(trained_agent_screen, episodes=100, reward_threshold=10000)
    average_rewards.append(avg_reward)
    print(f"Average reward for configuration {config}: {avg_reward}\n")

In [None]:
# Human-readable x-axis label for each tested configuration.
config_labels = [
    f"Height: {cfg['height']}, Width: {cfg['width']}, Pipe Gap: {cfg['pipe_gap']}"
    for cfg in configurations
]

In [None]:
# Bar chart: one bar per configuration showing its average test reward.
plt.figure(figsize=(10, 6))
plt.bar(config_labels, average_rewards)
plt.xlabel('Configuration')
plt.ylabel('Average Reward')
plt.title('Average Reward for Different Configurations')
# The labels are long, so rotate them and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/average_reward_histogram_sarsa_screen.png')
plt.show()

#### *Test on dist agent*

In [None]:
# Restore the SARSA(lambda) agent pickled by the distance-environment training cell.
with open('../agents/sarsa-agent-dist.pkl', 'rb') as f:
    trained_agent_dist = pickle.load(f)

# Level layouts to evaluate; the keys match the environment's keyword arguments.
configurations = [
    {'height': 15, 'width': 20, 'pipe_gap': 4},
    {'height': 20, 'width': 25, 'pipe_gap': 7},
    {'height': 25, 'width': 33, 'pipe_gap': 10}
]

# One average reward per configuration, in the same order as `configurations`.
average_rewards = []

for config in configurations:
    print(f"Testing on configuration: {config}")
    # Swap the agent's environment for one built with this layout.
    env = gym.make('TextFlappyBird-v0', **config)
    trained_agent_dist.env = env
    avg_reward = test_sarsa_lambda_agent(trained_agent_dist, episodes=100, reward_threshold=10000)
    average_rewards.append(avg_reward)
    print(f"Average reward for configuration {config}: {avg_reward}\n")

In [None]:
# Human-readable x-axis label for each tested configuration.
config_labels = [
    f"Height: {cfg['height']}, Width: {cfg['width']}, Pipe Gap: {cfg['pipe_gap']}"
    for cfg in configurations
]

In [None]:
# Bar chart: one bar per configuration showing its average test reward.
plt.figure(figsize=(10, 6))
plt.bar(config_labels, average_rewards)
plt.xlabel('Configuration')
plt.ylabel('Average Reward')
plt.title('Average Reward for Different Configurations')
# The labels are long, so rotate them and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# savefig must come before show so the figure is written while it is still current.
plt.savefig('../reports/figures/average_reward_histogram_sarsa_dist.png')
plt.show()