# Reinforcement Learning - Lab 1
### J. Martinet

### 1) Install Gymnasium

See the [Gymnasium](https://gymnasium.farama.org) page.
Check the [Gymnasium GitHub repository](https://github.com/Farama-Foundation/Gymnasium) for the installation procedure.

Check your installation of gymnasium:

In [1]:
try:
    import gymnasium as gym
except:
    !pip3 install gymnasium
    import gymnasium as gym

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
   ---------------------------------------- 0.0/958.1 kB ? eta -:--:--
    -------------------------------------- 20.5/958.1 kB 640.0 kB/s eta 0:00:02
   - ------------------------------------- 41.0/958.1 kB 487.6 kB/s eta 0:00:02
   -- ------------------------------------ 61.4/958.1 kB 544.7 kB/s eta 0:00:02
   -- ------------------------------------ 61.4/958.1 kB 544.7 kB/s eta 0:00:02
   -- ------------------------------------ 71.7/958.1 kB 326.8 kB/s eta 0:00:03
   ----- -------------------------------- 143.4/958.1 kB 566.5 kB/s eta 0:00:02
   ------ ------------------------------- 153.6/958.1 kB 573.4 kB/s eta 0:00:02
   ------- ------------------------------ 194.6/958.1 kB 588.9 kB/s eta 0:00:02
   -------- --


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


If there is time, start implementing TicTacToe with 1/2 player(s)

In [1]:
# your code here
import gymnasium as gym


In [1]:
import numpy as np
import random

class TicTacToe:
    """Minimal 3x3 tic-tac-toe environment on an integer NumPy board.

    Empty cells hold 0; a move writes the mover's id into the chosen cell.
    ``done`` and ``winner`` describe the state of the current game.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Start a new game and return the (live) board array."""
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.winner = None
        return self.board

    def available_actions(self):
        """Return the ``(row, col)`` of every empty cell, in row-major order."""
        free = []
        for r in range(3):
            for c in range(3):
                if self.board[r, c] == 0:
                    free.append((r, c))
        return free

    def _has_line(self, player):
        """Tell whether ``player`` fills a whole row, column or diagonal."""
        lines = [self.board[k, :] for k in range(3)]
        lines += [self.board[:, k] for k in range(3)]
        lines += [self.board.diagonal(), np.fliplr(self.board).diagonal()]
        return any((line == player).all() for line in lines)

    def step(self, action, player):
        """Play ``player`` at ``action`` and return ``(board, reward, done)``.

        The reward is +1 when player 1 completes a line, -1 when any other
        player does, and 0 otherwise (including draws).

        Raises:
            ValueError: if the chosen cell is already occupied.
        """
        if self.board[action] != 0:
            raise ValueError("Invalid action!")
        self.board[action] = player

        if self._has_line(player):
            self.done = True
            self.winner = player
            reward = 1 if player == 1 else -1
            return self.board, reward, self.done

        # No winner: the game only ends here if the board is full (draw).
        if not self.available_actions():
            self.done = True
        return self.board, 0, self.done

class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent for a 3x3 board.

    ``q_table`` maps a flattened board (tuple of nine cell values) to a dict
    holding one Q-value per ``(row, col)`` cell. Cells equal to 0 are treated
    as empty when bootstrapping from the next state.
    """

    # Every (row, col) coordinate of the 3x3 board, row-major.
    _ALL_CELLS = tuple((i, j) for i in range(3) for j in range(3))

    def __init__(self, player, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.q_table = {}
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.player = player

    def get_state_key(self, state):
        """Return a hashable snapshot of ``state`` (a NumPy board)."""
        return tuple(state.flatten())

    def _q_row(self, state_key):
        """Return the Q-values stored for ``state_key``, creating zeros on first use."""
        if state_key not in self.q_table:
            self.q_table[state_key] = dict.fromkeys(self._ALL_CELLS, 0)
        return self.q_table[state_key]

    def choose_action(self, state, available_actions):
        """Pick a move from ``available_actions`` with an epsilon-greedy rule.

        A random legal move is returned with probability ``epsilon`` or when
        the state has never been seen; otherwise the legal move with the
        highest Q-value is returned, ties broken at random.
        """
        state_key = self.get_state_key(state)
        if np.random.rand() < self.epsilon or state_key not in self.q_table:
            return random.choice(available_actions)

        q_values = self.q_table[state_key]
        # Only legal moves may compete: the stored row covers all nine cells,
        # and an occupied cell (never updated, so Q == 0) must not be chosen.
        legal_q = {action: q_values.get(action, 0) for action in available_actions}
        max_q = max(legal_q.values())
        best_actions = [action for action, q in legal_q.items() if q == max_q]
        return random.choice(best_actions)

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update to ``Q(state, action)``.

        The bootstrap term is the best Q-value among the empty cells of
        ``next_state``; it is 0 when ``next_state`` has no empty cell.
        """
        state_row = self._q_row(self.get_state_key(state))
        next_row = self._q_row(self.get_state_key(next_state))

        # Occupied cells keep their initial 0 forever; including them in the
        # max would hide the fact that every legal continuation is negative.
        open_cells = [cell for cell in self._ALL_CELLS if next_state[cell] == 0]
        max_next_q = max((next_row[cell] for cell in open_cells), default=0)

        current_q = state_row[action]
        state_row[action] = current_q + self.learning_rate * (
            reward + self.discount_factor * max_next_q - current_q
        )

# Training the agents
#
# Two independent Q-learning agents play each other. An agent's move is only
# scored once that agent sees the board again (after the opponent's reply) or
# once the game ends, so the bootstrap state is always one in which the same
# agent is to move.
game = TicTacToe()
agent1 = QLearningAgent(player=1)
agent2 = QLearningAgent(player=-1)

for episode in range(10000):
    game.reset()
    # player id -> (state, action) of that agent's last move, not yet scored.
    pending = {}
    mover, waiter = agent1, agent2

    while not game.done:
        # Snapshot the board: game.step() mutates game.board in place, so
        # keeping a reference would make "state" and "next state" identical.
        state = game.board.copy()
        legal = game.available_actions()

        if mover.player in pending:
            prev_state, prev_action = pending.pop(mover.player)
            mover.update_q_value(prev_state, prev_action, 0, state)

        action = mover.choose_action(state, legal)
        if action not in legal:
            # Defensive: never hand an occupied cell to game.step().
            action = random.choice(legal)

        _, reward, done = game.step(action, player=mover.player)

        if done:
            final_state = game.board.copy()
            # game.step() reports +1/-1 from player 1's point of view;
            # multiplying by the mover's id gives +1 for "the mover won".
            outcome = reward * mover.player
            mover.update_q_value(state, action, outcome, final_state)
            if waiter.player in pending:
                prev_state, prev_action = pending.pop(waiter.player)
                waiter.update_q_value(prev_state, prev_action, -outcome, final_state)
        else:
            pending[mover.player] = (state, action)
            mover, waiter = waiter, mover

print("Training complete!")


Training complete!


Try to implement a simple temporal-difference (TD) update of p(win).
Remember that the general definition of the TD update rule is:

$$ V(s_t) \leftarrow V(s_t) + \alpha \left[ V(s_{t+1}) - V(s_t) \right] $$

In [2]:
# your code here

import numpy as np
import random

class TicTacToe:
    """3x3 tic-tac-toe environment; the board is an integer array, 0 = empty."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board and the game-over flags, then return the board."""
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.winner = None
        return self.board

    def available_actions(self):
        """Return the coordinates of all empty cells as ``(row, col)`` tuples."""
        coords = ((r, c) for r in range(3) for c in range(3))
        return [cell for cell in coords if self.board[cell] == 0]

    def step(self, action, player):
        """Place ``player`` on ``action``; return ``(board, reward, done)``.

        Reward is +1 if player 1 just won, -1 if another player just won and
        0 in every other case. Raises ``ValueError`` on an occupied cell.
        """
        if self.board[action] != 0:
            raise ValueError("Invalid action!")
        self.board[action] = player

        # Rows and columns first, then the two diagonals.
        lines = []
        for k in range(3):
            lines.append(self.board[k, :])
            lines.append(self.board[:, k])
        lines.append(self.board.diagonal())
        lines.append(np.fliplr(self.board).diagonal())

        if any((line == player).all() for line in lines):
            self.done = True
            self.winner = player
            return self.board, (1 if player == 1 else -1), self.done

        if not self.available_actions():  # Draw
            self.done = True

        return self.board, 0, self.done

class TDAgent:
    """Random-playing agent that keeps a table of state values.

    ``values`` maps a flattened board to a float; states that were never
    written read as 0.5.
    """

    def __init__(self, player, learning_rate=0.1):
        self.player = player
        self.learning_rate = learning_rate
        self.values = {}

    def get_state_key(self, state):
        """Flatten ``state`` into a hashable tuple."""
        return tuple(state.flatten())

    def get_value(self, state):
        """Return the stored value of ``state`` (0.5 if it is unknown)."""
        return self.values.get(self.get_state_key(state), 0.5)

    def choose_action(self, state, available_actions):
        """Pick one of ``available_actions`` uniformly at random."""
        return random.choice(available_actions)

    def update_value(self, state, next_state):
        """Move V(state) towards V(next_state) by ``learning_rate``."""
        current_value = self.get_value(state)
        next_value = self.get_value(next_state)
        td_error = next_value - current_value
        self.values[self.get_state_key(state)] = current_value + self.learning_rate * td_error

# Training the agents
#
# Both agents play random moves and each one learns its own p(win) table.
# Every agent observes every transition (its own moves and the opponent's),
# so the outcome of a game propagates back through the whole sequence.
game = TicTacToe()
agent1 = TDAgent(player=1)
agent2 = TDAgent(player=-1)
agents = (agent1, agent2)

for episode in range(10000):
    game.reset()
    turn = 0
    while not game.done:
        mover = agents[turn % 2]  # agent1 always opens the game

        # Copy the board before and after the move: game.step() mutates
        # game.board in place, so holding a reference would make the TD
        # update compare a state with itself.
        state = game.board.copy()
        action = mover.choose_action(state, game.available_actions())
        _, reward, done = game.step(action, player=mover.player)
        next_state = game.board.copy()

        if done:
            # Anchor the terminal state: p(win) is 1 for the winner and 0
            # otherwise (loss or draw). Without this every value stays at
            # its 0.5 default.
            for agent in agents:
                terminal_key = agent.get_state_key(next_state)
                agent.values[terminal_key] = 1.0 if game.winner == agent.player else 0.0

        for agent in agents:
            agent.update_value(state, next_state)
        turn += 1

print("Training complete!")


Training complete!


In [3]:
# Re-import necessary libraries since the code environment has reset
import numpy as np

# Sample points (t, y); the spacing in t is not uniform.
t_values = np.array([1, 2, 4])
y_values = np.array([-0.44796, -0.56015, -5.09196])

# Forward differences: slope of y over each interval [t_k, t_{k+1}].
time_steps = np.diff(t_values)
dy_dt = np.diff(y_values) / time_steps

# Forward difference of the slopes, divided by the first time step.
d2y_dt2 = np.diff(dy_dt) / time_steps[:-1]

# dy/dt is attributed to t=1 and t=2, d2y/dt2 to t=1.
dy_dt_values = dy_dt[:2].copy()
d2y_dt2_values = d2y_dt2[:1].copy()

# Notebook display of the computed derivatives
dy_dt_values, d2y_dt2_values


(array([-0.11219 , -2.265905]), array([-2.153715]))

In [18]:
import numpy as np
import random                                    
# Empty 3x3 board of one-character strings; '-' marks a free cell.
board = np.full((3, 3), '-')
# Player symbols, in turn order.
players = ['X', 'O']
num_players = len(players)
# Q-table shared by the cells below (filled during training).
Q = {}

In [19]:
# Hyperparameters read by the Q-learning cells below.
learning_rate = 0.001  # step size applied to each Q-value correction
discount_factor = 0.9  # weight of the best next-state Q-value in the update target
exploration_rate = 0.5  # initial probability of a random move; multiplied by 0.99 after each episode
num_episodes = 10000  # number of training games

In [20]:
def print_board(board):
    """Print the board row by row, each row followed by a dashed rule."""
    rule = '---------------'
    for cells in board:
        print('  |  '.join(cells), rule, sep='\n')
print_board(board)


# Serialise the board so it can be used as a key of the Q-table dictionary.
def board_to_string(board):
    """Concatenate all cells of ``board`` in row-major order into one string."""
    cells = board.ravel()
    return ''.join(cells)
board_to_string(board)


# Demo: an action is one cell drawn at random among the empty cells.
# np.argwhere yields one [row, col] index pair per cell equal to '-'.
empty_cells = np.argwhere(board == '-')
action = tuple(random.choice(empty_cells))
print(action)

-  |  -  |  -
---------------
-  |  -  |  -
---------------
-  |  -  |  -
---------------
(2, 2)


In [21]:
# Function to check if the game is over by checking different winning condition

def is_game_over(board):
    """Report whether the game has ended and who won.

    Returns ``(True, symbol)`` when a row, column or diagonal is filled with
    one non-empty symbol, ``(True, 'draw')`` when the board is full without a
    winner, and ``(False, None)`` otherwise.
    """
    # Same inspection order as before: rows, columns, main diagonal,
    # anti-diagonal (obtained by flipping the board left-to-right).
    lines = [*board, *board.T, board.diagonal(), np.fliplr(board).diagonal()]

    for line in lines:
        first = line[0]
        if first != '-' and all(cell == first for cell in line):
            return True, first

    # No completed line: the game continues while a free cell remains.
    if '-' in board:
        return False, None
    return True, 'draw'

In [22]:
# Function to choose an action based on the Q-table

#Random exploration condition in the choose_action function checks whether agent should perform a random exploration or not or if current state is not present in the Q-table
#if random exploration is choosen,
#a random action is chosen from the available empty cells on the board.
# This promotes exploration and allows the agent to try out different actions and gather more information about the environment.


#if exploitation is choosen,
#the function selects the action with the highest Q-value from the available empty cells.
#and do action - > update it with player symbol (X or O according to player[])

def choose_action(board, exploration_rate):
    """Choose an empty cell of ``board`` with an epsilon-greedy rule.

    With probability ``exploration_rate``, or when the board is not yet a key
    of ``Q``, a random empty cell is returned. Otherwise the empty cell with
    the highest Q-value is returned, ties broken at random.
    """
    state = board_to_string(board)
    empty_cells = np.argwhere(board == '-')  # one [row, col] pair per free cell

    explore = random.uniform(0, 1) < exploration_rate
    if explore or state not in Q:
        return tuple(random.choice(empty_cells))

    # Exploitation: compare the stored Q-values of the free cells only.
    q_values = Q[state]
    candidate_values = [q_values[r, c] for r, c in empty_cells]
    best_value = max(candidate_values)
    best_positions = [k for k, value in enumerate(candidate_values) if value == best_value]
    return tuple(empty_cells[random.choice(best_positions)])

In [23]:
# Build the board that results from playing a mark on the given cell.
def board_next_state(cell, current_board=None, player=None):
    """Return a copy of a board with ``cell`` set to a player's mark.

    Args:
        cell: ``(row, col)`` coordinates of the move.
        current_board: board to start from; defaults to the module-level
            ``board``.
        player: mark to write; defaults to ``players[0]``.

    Returns:
        A new array; the source board is left untouched.
    """
    source = board if current_board is None else current_board
    mark = players[0] if player is None else player
    next_state = source.copy()
    next_state[cell[0], cell[1]] = mark
    return next_state

In [24]:
# Function to update the Q-table
agent_wins = 0
# def update_q_table(state, action, next_state, reward):
#     q_values = Q.get(state, np.zeros((3, 3)))                               #Retrieve the Q-values for a particular state from the Q-table dictionary Q.
#     next_q_values = Q.get(board_to_string(next_state), np.zeros((3, 3)))       # Calculate the maximum Q-value for the next state from q table
#     max_next_q_value = np.max(next_q_values)                                #find maxmium q values from q values of nxt state



#     # Q-learning update equation
#     q_values[action[0], action[1]] += learning_rate * (reward + discount_factor * max_next_q_value - q_values[action[0], action[1]])
# #Q-learning update equation calculates the new Q-value for the current state-action pair based on the immediate reward, the discounted future rewards, and the current Q-value.
# #By subtracting the current Q-value from the estimated total reward, it calculates the temporal difference (TD) error, which represents the discrepancy between the expected reward and the actual reward.


# #The new Q-value is obtained by updating the current Q-value using the TD error, the learning rate, and the discount factor. This update process helps the Q-values to gradually converge towards the optimal values, reflecting the expected long-term rewards for each state-action pair.
#     Q[state] = q_values

def update_q_table(state, action, next_state, reward):
    """Apply one Q-learning update to the entry ``Q[state][action]``.

    ``state`` is a board string, ``action`` a ``(row, col)`` pair and
    ``next_state`` a board array. States missing from ``Q`` count as a 3x3
    array of zeros.
    """
    row, col = action[0], action[1]
    q_values = Q.get(state, np.zeros((3, 3)))

    # Best value reachable from the next state, as currently estimated.
    next_key = board_to_string(next_state)
    best_next = np.max(Q.get(next_key, np.zeros((3, 3))))

    # Move the estimate a fraction `learning_rate` towards the target.
    td_error = reward + discount_factor * best_next - q_values[row, col]
    q_values[row, col] += learning_rate * td_error

    Q[state] = q_values

# Main Q-learning algorithm (self-play; both symbols share the table Q)
#
# Q is keyed on the board *before* a move, which is the board choose_action()
# looks up. A move is scored when the same player sees the board again after
# the opponent's reply, or when the game ends.
for episode in range(num_episodes):
    board = np.array([['-', '-', '-'],
                      ['-', '-', '-'],
                      ['-', '-', '-']])

    current_player = random.choice(players)
    game_over = False
    # symbol -> (state string, action) of that player's last unscored move
    pending = {}

    while not game_over:
        # Key of the decision state, taken before the board is modified.
        state = board_to_string(board)

        # The mover's previous move can now be scored: `board` is the
        # position that move led to once the opponent has answered.
        if current_player in pending:
            prev_state, prev_action = pending.pop(current_player)
            update_q_table(prev_state, prev_action, board, 0)

        # Choose an action based on the current state
        action = choose_action(board, exploration_rate)

        # Make the chosen move
        row, col = action
        board[row, col] = current_player

        # Check if the game is over
        game_over, winner = is_game_over(board)

        if game_over:
            # The game can only end by the mover winning or by a draw.
            if winner == 'draw':
                reward, loser_reward = 0.5, 0.5
            else:
                reward, loser_reward = 1, 0
            update_q_table(state, action, board, reward)
            # The other player's last move led to this result as well.
            for prev_state, prev_action in pending.values():
                update_q_table(prev_state, prev_action, board, loser_reward)
        else:
            pending[current_player] = (state, action)
            # Switch to the next player
            current_player = players[(players.index(current_player) + 1) % num_players]

    # Decay the exploration rate
    exploration_rate *= 0.99

# Play against the trained agent: the human is 'X', the agent is 'O'.
board = np.array([['-', '-', '-'],
                  ['-', '-', '-'],
                  ['-', '-', '-']])

current_player = random.choice(players)
game_over = False

while not game_over:
    if current_player == 'X':
        # Human player's turn: keep asking until a free, in-range cell is given.
        print_board(board)
        while True:
            try:
                row = int(input("Enter the row (0-2): "))
                col = int(input("Enter the column (0-2): "))
            except ValueError:
                print("Please enter whole numbers.")
                continue
            # Reject out-of-range indices (negative ones would wrap around)
            # and cells that are already taken.
            if 0 <= row <= 2 and 0 <= col <= 2 and board[row, col] == '-':
                break
            print("That cell is not available, try again.")
        action = (row, col)
    else:
        # Trained agent's turn: exploration disabled, purely greedy.
        action = choose_action(board, exploration_rate=0)

    row, col = action
    board[row, col] = current_player

    game_over, winner = is_game_over(board)

    if game_over:
        print_board(board)
        if winner == 'X':
            print("Human player wins!")
        elif winner == 'O':
            print("Agent wins!")
        else:
            print("It's a draw!")
    else:
        current_player = players[(players.index(current_player) + 1) % num_players]

#agent_win_percentage = (agent_wins / num_games) * 100
#print("Agent win percentage: {:.2f}%".format(agent_win_percentage))

O  |  -  |  -
---------------
-  |  -  |  -
---------------
-  |  -  |  -
---------------
Enter the row (0-2): 0
Enter the column (0-2): 2
O  |  -  |  X
---------------
-  |  O  |  -
---------------
-  |  -  |  -
---------------
Enter the row (0-2): 2
Enter the column (0-2): 2
O  |  -  |  X
---------------
-  |  O  |  -
---------------
-  |  O  |  X
---------------
Enter the row (0-2): 0
Enter the column (0-2): 1
O  |  X  |  X
---------------
-  |  O  |  -
---------------
O  |  O  |  X
---------------
Enter the row (0-2): 1
Enter the column (0-2): 2
O  |  X  |  X
---------------
-  |  O  |  X
---------------
O  |  O  |  X
---------------
Human player wins!


In [29]:
# # Main Q-learning algorithm
# num_draws = 0  # Counter for the number of draws
# agent_wins = 0  # Counter for the number of wins by the agent

# for episode in range(num_episodes):
#     board = np.array([['-', '-', '-'],
#                       ['-', '-', '-'],
#                       ['-', '-', '-']])

#     current_player = random.choice(players)  # Randomly choose the current player
#     game_over = False

#     while not game_over:
#         action = choose_action(board, exploration_rate)  # Choose an action using the exploration rate

#         row, col = action
#         board[row, col] = current_player  # Update the board with the current player's move

#         game_over, winner = is_game_over(board)  # Check if the game is over and determine the winner

#         if game_over:
#             if winner == current_player:  # Agent wins
#                 reward = 1
#                 agent_wins += 1
#             elif winner == 'draw':  # Game ends in a draw
#                 reward = 0
#                 num_draws += 1
#             else:  # Agent loses
#                 reward = -1
#             update_q_table(board_to_string(board), action, board, reward)  # Update the Q-table
#         else:
#             current_player = players[(players.index(current_player) + 1) % num_players]  # Switch to the next player

#         if not game_over:
#             next_state = board_next_state(action)
#             update_q_table(board_to_string(board), action, next_state, 0)  # Update the Q-table with the next state

#     exploration_rate *= 0.99  # Decrease the exploration rate over time

# # Play multiple games between the trained agent and itself
# agent_win_percentage = (agent_wins / num_games) * 100
# draw_percentage = (num_draws / num_games) * 100

# print("Agent win percentage: {:.2f}%".format(agent_win_percentage))
# print("Draw percentage: {:.2f}%".format(draw_percentage))

In [33]:
import random
# Cell markers stored in Game.state.
AGENT = 1  # the learning player
OPPONENT = -1  # the other player
NO_PLAYER = 0  # empty cell; also what Game.get_winner() returns when nobody has won

In [34]:
class Game:
    """Tic-tac-toe position stored as a flat, row-major list of nine cells.

    Each cell holds a player id or ``NO_PLAYER``. ``make_move`` does not
    modify the position; it returns a new ``Game``.
    """

    # Index triples of the eight winning lines: rows, columns, diagonals.
    _LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self, game_state=None):
        if game_state is None:
            game_state = [NO_PLAYER] * 9
        self.state = game_state

    def __str__(self):
        return str(self.state)

    def is_draw(self):
        """Return True when the board is full and nobody has won."""
        return NO_PLAYER not in self.state and self.get_winner() == NO_PLAYER

    def is_finished(self):
        """Return True when a player has won or no empty cell is left."""
        return self.get_winner() != NO_PLAYER or self.is_draw()

    def valid_moves(self):
        """Return the indices (0-8) of the empty cells."""
        return [i for i in range(9) if self.state[i] == NO_PLAYER]

    def make_move(self, field, player):
        """Return a new ``Game`` in which ``player`` occupies cell ``field``."""
        cells = list(self.state)
        cells[field] = player
        return Game(cells)

    def get_winner(self):
        """Return the id of the player owning a complete line, else ``NO_PLAYER``."""
        state = self.state
        for a, b, c in self._LINES:
            owner = state[a]
            if owner != NO_PLAYER and owner == state[b] == state[c]:
                return owner
        return NO_PLAYER

In [35]:
def play_games(policy, opponent_policy, num_games=100):
    """Play ``num_games`` games and return ``(games_won, draw)``.

    ``policy`` picks the agent's moves and ``opponent_policy`` the opponent's;
    both are called with the current ``Game`` and return a cell index. The
    opponent opens the game in roughly half of the games.
    """
    games_won = draw = 0

    for _ in range(num_games):
        game = Game()

        # 50% chance opponent starts
        if random.random() > 0.5:
            game = game.make_move(opponent_policy(game), OPPONENT)

        # Alternate agent / opponent until the position is terminal.
        while not game.is_finished():
            game = game.make_move(policy(game), AGENT)
            if not game.is_finished():
                game = game.make_move(opponent_policy(game), OPPONENT)

        winner = game.get_winner()
        if winner == 0:
            draw += 1
        elif winner > 0:
            games_won += 1

    return games_won, draw

In [36]:
def reward(game):
    """Return the winner's id when it is positive, otherwise 0."""
    winner = game.get_winner()
    return winner if winner > 0 else 0

In [37]:
class ValuePolicy:
    """Greedy policy driven by a learned table of state values.

    ``values`` maps ``str(state)`` to a value; unknown states read as
    ``DEFAULT_VALUE``.
    """

    DEFAULT_VALUE = 0.5

    def __init__(self, learning_rate=0.1):
        """Create an empty value table.

        Args:
            learning_rate: step size of the temporal-difference update used
                by ``learn``.
        """
        self.values = {}
        self.learning_rate = learning_rate

    def policy(self, game):
        """Return the valid move whose resulting position has the highest value.

        Ties go to the first such move in ``game.valid_moves()`` order.
        """
        return max(
            game.valid_moves(),
            key=lambda move: self.get_state_value(game.make_move(move, AGENT)),
        )

    def get_state_value(self, state):
        """Return the stored value of ``state`` or ``DEFAULT_VALUE``."""
        return self.values.get(str(state), self.DEFAULT_VALUE)

    def set_state_value(self, state, value):
        """Store ``value`` for ``state``."""
        self.values[str(state)] = value

    def learn(self, states):
        """Update the value table from one game's sequence of positions.

        The last position receives ``reward(last_state)``; the earlier ones
        are then visited from end to start, each moving towards the freshly
        updated value of its successor. An empty sequence is a no-op.
        """
        if not states:
            return

        *earlier, final_state = states
        last_value = reward(final_state)
        self.set_state_value(final_state, last_value)

        # Go through every state from end to start
        for state in reversed(earlier):
            value = self.get_state_value(state)
            last_value = value + self.learning_rate * (last_value - value)
            self.set_state_value(state, last_value)

In [38]:
def random_policy(game):
    """Return one of the game's valid moves, chosen uniformly at random."""
    candidates = game.valid_moves()
    return random.choice(candidates)

In [39]:
def train(policy, opponent_policy, training_games=1000, exploration_rate=0.5):
    """Train ``policy`` by playing games against ``opponent_policy``.

    Args:
        policy: object exposing ``policy(game)`` and ``learn(states)``.
        opponent_policy: callable mapping a ``Game`` to the opponent's move.
        training_games: number of games to play.
        exploration_rate: probability that the agent plays a random move
            instead of the move chosen by ``policy``.
    """
    for _ in range(training_games):
        game = Game()
        states = []

        # 50% chance opponent starts
        if random.random() > 0.5:
            game = game.make_move(opponent_policy(game), OPPONENT)

        while not game.is_finished():
            # Our agent makes a move, but occasionally a random one so that
            # positions the greedy policy would never reach get visited.
            if random.random() < exploration_rate:
                move = random_policy(game)
            else:
                move = policy.policy(game)
            game = game.make_move(move, AGENT)
            states.append(game)

            if game.is_finished():
                break

            game = game.make_move(opponent_policy(game), OPPONENT)
            states.append(game)

        # Learn from the full sequence of positions of this game.
        policy.learn(states)

In [43]:
# Train a fresh value policy against a random opponent, then evaluate it.
policy = ValuePolicy()
train(policy, random_policy, training_games=1000)

games_to_play = 1000
games_won, draw = play_games(policy.policy, random_policy, games_to_play)

# One summary line per statistic.
summary = (
    ("Games played: %s", games_to_play),
    ("Games won: %s", games_won),
    ("Draw: %s", draw),
)
for template, count in summary:
    print(template % count)

Games played: 1000
Games won: 744
Draw: 61
