In [16]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
import time
import tqdm
from IPython.display import Image
import imageio

In [17]:
class PongEnv(gym.Env):
    def __init__(self, name, render_mode='rgb_array') -> None:
        super(PongEnv, self).__init__()
        self.env = gym.make(name, render_mode=render_mode)
        self.total_timesteps = 0
        self.memory = []
        self.action_space = self.env.action_space
    
    def take_step(self, agent, action):
        self.total_timesteps += 1
            
        next_frame, next_frames_reward, next_frame_terminal, truncated, info = self.env.step(action)
        
        # return next_state, reward, done, _ 
        return next_frame, next_frames_reward, next_frame_terminal, truncated
    
    def reset(self) -> np.ndarray:
        self.env.reset()
    
    def _get_obs(self) -> np.ndarray:
        return self.env._get_obs()

#### Methoden

In [18]:
def MonteCarlo(agent, episode):
    G=0
    visited_state_actions = set()  # To keep track of state-action pairs we've seen

    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        state_str = str(state)
        G = agent.gamma * G + reward

        # Check if the state-action pair is visited for the first time in this episode
        if not (state_str, action) in visited_state_actions:
            visited_state_actions.add((state_str, action))  # Mark this state-action as visited
            agent.returns[(state_str, action)].append(G)
            agent.Q[state_str][action] = np.mean(agent.returns[(state_str, action)])
    return agent

def Q_Algorithm(agent, episode):
    G=0
    score = 0
    for state, action, reward in reversed(episode):
        state_str = str(state)
        score += reward
        G = agent.gamma * G + reward  # Calculate the cumulative reward
        # Q-learning update
        agent.Q[state_str][action] += agent.alpha * (G - agent.Q[state_str][action])
    return agent

In [19]:
class PongAgent:
    def __init__(self, env, gamma=0.9, alpha=0.1, epsilon=1., epsilon_min=0.01, epsilon_decay=0.995):
        self.env = env
        self.gamma = gamma  # Discount factor for future rewards
        self.alpha = alpha  # Learning rate
        self.epsilon = epsilon  # Epsilon-greedy exploration parameter
        self.epsilon_min = epsilon_min  # Minimum epsilon
        self.epsilon_decay = epsilon_decay
        self.Q = defaultdict(lambda: np.zeros(self.env.action_space.n))
        self.returns = defaultdict(list)

    def policy(self, state):
        # Epsilon-greedy policy
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()  # Explore by selecting a random action
        else:
            state_str = str(state)
            return np.argmax(self.Q[state_str])  # Exploit by selecting the best action

    def generate_episode(self):
        episode = []
        state = self.env.reset()
        done = False

        while not done:
            action = self.policy(state)
            next_state, reward, done, _ = self.env.take_step(self, action)
            episode.append((state, action, reward))
            state = next_state

        return episode

    def update(self, algorithm='Q-Learning'):
        episode = self.generate_episode()
        
        if algorithm == 'Q-Learning':
            self = Q_Algorithm(self, episode)
        elif algorithm == 'MonteCarlo':
            self = MonteCarlo(self, episode)
        else:
            raise Exception('Algorithm not implemented')
        # Decay ε after each episode
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


    def train(self, num_episodes, algorithm='Q-Learning'):
        for ep in tqdm.tqdm(range(num_episodes), desc='Training', unit='ep'):
            self.update(algorithm=algorithm)
        return self.Q
            

    def get_best_action(self, state):
        state_str = str(state)
        return np.argmax(self.Q[state_str])


In [20]:
env = PongEnv('ALE/Pong-v5', render_mode='rgb_array')
agt = PongAgent(env)
theAgentQ = agt.train(20000, algorithm='MonteCarlo')

Training:   0%|          | 0/20000 [00:00<?, ?ep/s]

Training: 100%|██████████| 20000/20000 [3:39:20<00:00,  1.52ep/s]  


In [21]:
def render_episode(self, output_file):
    episode = []
    state = self.env.reset()
    done = False
    actions=[]
    frames = []  # To store frames for the GIF
    while not done:
        action = self.policy(state)
        actions.append(action)
        next_state, reward, done, _ = self.env.take_step(self, action)
        episode.append((state, action, reward))
        state = next_state

        # Render from state
        frame = self.env.env.render()
        frames.append(frame)

    # Save frames as a GIF
    imageio.mimsave(output_file, frames, duration=1 / 60)  # Assuming 30 FPS
    print('Saved the GIF: {}'.format(output_file))
    print(f'Total Steps: {len(actions)}')
    for i in range(3):
        print(f'Action {i}: {actions.count(i)} = {actions.count(i)/len(actions)}%')
    return episode

In [22]:
episode = render_episode(agt, 'pong_episode.gif')

  logger.warn(


Saved the GIF: pong_episode.gif
Total Steps: 764
Action 0: 0 = 0.0%
Action 1: 1 = 0.0013089005235602095%
Action 2: 762 = 0.9973821989528796%


In [23]:
agt.Q

defaultdict(<function __main__.PongAgent.__init__.<locals>.<lambda>()>,
            {'[[[  0   0   0]\n  [  0   0   0]\n  [  0   0   0]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n [[144  72  17]\n  [144  72  17]\n  [144  72  17]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n [[144  72  17]\n  [144  72  17]\n  [144  72  17]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n ...\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]]': array([-0.43206023, -0.43206888, -0.43205258, -0.43206178, -0.43207946,
                    -0.43205683]),
             'None': array([-0.00130111, -0.00129296, -0.00130422, -0.00117634, -0.00126609,
                    -0.00104134])}

In [24]:
theQ

defaultdict(<function __main__.PongAgent.__init__.<locals>.<lambda>()>,
            {'[[[  0   0   0]\n  [  0   0   0]\n  [  0   0   0]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n [[144  72  17]\n  [144  72  17]\n  [144  72  17]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n [[144  72  17]\n  [144  72  17]\n  [144  72  17]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n ...\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]]': array([-0.25730191, -0.23752578, -0.00953974, -0.25206889, -0.28119456,
                    -0.2589139 ]),
             'None': array([-0.00134365, -0.00134365, -0.00134365, -0.00134365, -0.00134365,
                    -0.00134365])}

defaultdict(<function __main__.PongAgent.__init__.<locals>.<lambda>()>,
            {'[[[  0   0   0]\n  [  0   0   0]\n  [  0   0   0]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n [[144  72  17]\n  [144  72  17]\n  [144  72  17]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n [[144  72  17]\n  [144  72  17]\n  [144  72  17]\n  ...\n  [144  72  17]\n  [144  72  17]\n  [144  72  17]]\n\n ...\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]\n\n [[236 236 236]\n  [236 236 236]\n  [236 236 236]\n  ...\n  [236 236 236]\n  [236 236 236]\n  [236 236 236]]]': array([-0.25730191, -0.23752578, -0.00953974, -0.25206889, -0.28119456,
                    -0.2589139 ]),
             'None': array([-0.00134365, -0.00134365, -0.00134365, -0.00134365, -0.00134365,
                    -0.00134365])})

Using Q-learning for a Pong environment is possible, but it might not be the most effective approach. Q-learning is a model-free reinforcement learning algorithm that can work well for simple environments with a relatively small state and action space. Pong, on the other hand, is a more complex environment with a large state space due to the continuous nature of the screen pixels.

In Q-learning, you maintain a Q-table that stores the expected cumulative rewards for each state-action pair. In the case of Pong, the state space would be huge because it would need to include all the possible screen configurations, which makes Q-learning impractical in this scenario. This table would be too large to store and update efficiently.