In [1]:
!pip install git+https://gitlab-research.centralesupelec.fr/stergios.christodoulidis/text-flappy-bird-gym.git

Collecting git+https://gitlab-research.centralesupelec.fr/stergios.christodoulidis/text-flappy-bird-gym.git
  Cloning https://gitlab-research.centralesupelec.fr/stergios.christodoulidis/text-flappy-bird-gym.git to c:\users\guezz\appdata\local\temp\pip-req-build-toxzd403
  Resolved https://gitlab-research.centralesupelec.fr/stergios.christodoulidis/text-flappy-bird-gym.git to commit ca2797e9270195313423324c9d0f205f6cbb3d28
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://gitlab-research.centralesupelec.fr/stergios.christodoulidis/text-flappy-bird-gym.git 'C:\Users\guezz\AppData\Local\Temp\pip-req-build-toxzd403'


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os, sys
import gymnasium as gym
import time
import text_flappy_bird_gym

# SARSA($\lambda$) from the book

We approximate $q(s,a) = \hat{q}(s,a,w) = w(a)^T s$. The vector $s \in \mathbb{R}^2$ contains the state, and w if a set of two $\mathbb{R}^2$ vectors, one for each action
We also have to keep a short memory 3D vector $z$.  
We will use the true online SARSA algorithm.

In [10]:
class SarsaLambdaAgent:
    def __init__(self, env, alpha=0.1, gamma=0.99, epsilon=0.1, lambda_=0.9):
        self.env = env
        self.infos = []

        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.lambda_ = lambda_  # Eligibility trace decay rate

        # Two weight vectors, one for each action (state dim = 2)
        self.w = np.zeros((2, 2))  # (num_actions, num_features)
        self.z = np.zeros((2, 2))  # Eligibility trace matrix

    def choose_action(self, state):
        """Epsilon-greedy action selection"""
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        else:
            # Compute Q-values for both actions
            q_values = [np.dot(self.w[a], state) for a in range(2)]
            return np.argmax(q_values)  # Choose action with highest Q-value

    def training_loop(self, num_episodes=10):
        for _ in range(num_episodes):
            state, info = self.env.reset()
            state = np.array(state).reshape(-1)

            action = int(np.random.rand() < 0.5)  # Random initial action
            done = False
            self.z.fill(0)  # Reset eligibility traces at start of each episode
            Q_old = 0  # Initialize Q_old

            while not done:
                # Take action and observe reward
                next_state, reward, done, _, info = self.env.step(action)
                next_state = np.array(next_state).reshape(-1)

                # Choose next action (ε-greedy)
                next_action = self.choose_action(next_state)

                # Compute Q-values for current and next state-action pair
                Q = np.dot(self.w[action], state)
                Q_next = np.dot(self.w[next_action], next_state)

                # Calculate the temporal difference error (δ)
                delta = reward + self.gamma * Q_next - Q
                print(delta)

                # Eligibility trace update (action-specific)
                self.z[action] = self.gamma * self.lambda_ * self.z[action] + \
                      (1 - self.alpha * self.gamma * self.lambda_ * np.dot(self.z[action], state)) * state

                # True Online SARSA(λ) weight update
                self.w[action] += self.alpha * (delta + Q - Q_old) * self.z[action] - self.alpha * (Q - Q_old) * state

                # Update Q_old for next iteration
                Q_old = Q_next

                # Move to the next state and action
                state = next_state
                action = next_action

            self.env.close()


In [12]:
env1 = gym.make('TextFlappyBird-v0', height=15, width=20, pipe_gap=5)
obs = env1.reset()

agent = SarsaLambdaAgent(env=env1, alpha = 0.001, gamma = 0.98, epsilon=0.1, lambda_ = 0) 

agent.training_loop(num_episodes=10**4)  # Test with 10 episodes
print("Training completed. Final weights:")
print(agent.w)

1.0
0.98984
0.9929498703999999
0.977218375728768
1.535935097020647
0.9637359409942288
0.9890443130736499
1.0196020083452033
1.5448524238338837
0.9313604996423401
0.9535551988611344
0.962465654314685
1.8111461378252982
0.8808475743739257
0.2556962260399396
1.5308338107384216
0.8731972186267252
0.9137184603516597
0.894756664233815
0.8196142173985905
0.8828032643294099
0.9518403309567098
-0.3107255237138058
1.8492589936683963
0.8327871689725677
0.8902863284533029
0.9339435695111646
0.7520000992089741
0.8004246595686659
0.8358531616234042
0.7738952975167326
2.462381766284933
0.7342050653040357
0.7779219820414189
0.8250187439054621
2.3210696475947
0.7075398952589715
0.7579842930513121
-0.3989002572962277
0.6572885794830414
0.7053922619285795
0.7501141712893062
0.6939202797427528
0.6192512805986983
0.6704503619574327
0.7104322903318163
0.6531716654805284
2.840442142104541
0.6106264118269458
0.6576743228531408
0.7072838098900998
0.5528067171529019
0.6065581049615201
0.6570030475664819
0.60088

KeyboardInterrupt: 

In [None]:
plt.plot([x['distance'] for x in agent.infos])
plt.show()

In [None]:
plt.plot([x['score'] for x in agent.infos])

In [None]:
gym.make('TextFlappyBird-v0', height=15, width=20, pipe_gap=5)