In [4]:
from datasets import load_dataset

import torch
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
from pprint import pprint
from tournamentgym import TournamentEnv

In [5]:
from Pearl.pearl.pearl_agent import PearlAgent
from Pearl.pearl.action_representation_modules.one_hot_action_representation_module import (
    OneHotActionTensorRepresentationModule,
)
from Pearl.pearl.policy_learners.sequential_decision_making.deep_q_learning import (
    DeepQLearning,
)
from Pearl.pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer import (
    FIFOOffPolicyReplayBuffer,
)
from Pearl.pearl.utils.instantiations.environments.gym_environment import GymEnvironment


In [6]:
from Pearl.pearl.neural_networks.sequential_decision_making.q_value_networks import VanillaQValueNetwork
from Pearl.pearl.utils.functional_utils.experimentation.set_seed import set_seed
from Pearl.pearl.policy_learners.sequential_decision_making.deep_q_learning import DeepQLearning
from Pearl.pearl.policy_learners.sequential_decision_making.double_dqn import DoubleDQN
from Pearl.pearl.replay_buffers.sequential_decision_making.fifo_off_policy_replay_buffer import FIFOOffPolicyReplayBuffer
from Pearl.pearl.utils.functional_utils.train_and_eval.online_learning import online_learning
from Pearl.pearl.pearl_agent import PearlAgent
from Pearl.pearl.utils.instantiations.environments.gym_environment import GymEnvironment
from Pearl.pearl.action_representation_modules.one_hot_action_representation_module import (
    OneHotActionTensorRepresentationModule,
)

In [8]:

# Table that TournamentEnv receives as its `team_stats` argument.
team_data = pd.read_csv('Process_data/M_pm_w_names.csv')

# Training environment: season 2022 with seasons 2023 and 2024 excluded,
# wrapped directly in Pearl's GymEnvironment adapter.
env = GymEnvironment(
    TournamentEnv(
        season=2022,
        team_stats=team_data,
        discrete=True,
        shuffle=True,
        exclude_seasons=[2023, 2024],
        reward_on_round_end=False,
        loading_bar=False,
    )
)

# Hidden-layer widths for the VanillaQValueNetwork MLPs built later on.
# That network is a simple MLP whose input dimension is (state_dim + action_dim).
hidden_dims = [64, 64]

# Size of the discrete action space; reused as the one-hot action dimension.
num_actions = env.action_space.n

# Last expression of the cell: display the wrapped action space.
env.action_space

<pearl.utils.instantiations.spaces.discrete_action.DiscreteActionSpace at 0x2929da9bd90>

In [9]:
# Q-value network that is handed to the DeepQLearning policy learner of the
# DQN agent. Actions use a one-hot representation, so action_dim = num_actions.
Q_value_network = VanillaQValueNetwork(
    output_dim=1,
    hidden_dims=hidden_dims,                   # widths of the intermediate layers
    action_dim=num_actions,                    # dimension of the action representation
    state_dim=env.observation_space.shape[0],  # dimension of the state representation
)

In [10]:
# DQN agent. The policy learner is given the pre-built `Q_value_network`
# through `network_instance` (rather than choosing one via `network_type`),
# and actions are one-hot encoded over `num_actions` entries.
DQNagent = PearlAgent(
    policy_learner=DeepQLearning(
        network_instance=Q_value_network,
        action_representation_module=OneHotActionTensorRepresentationModule(
            max_number_actions=num_actions,
        ),
        state_dim=env.observation_space.shape[0],
        action_space=env.action_space,
        batch_size=64,
        training_rounds=10,
        soft_update_tau=0.001,
    ),
    replay_buffer=FIFOOffPolicyReplayBuffer(10_000),
)

In [11]:
# Train the DQN agent on `env` with Pearl's online_learning helper.
# The returned dictionary exposes the episodic returns under "return".
info = online_learning(
    agent=DQNagent,
    env=env,
    number_of_episodes=200,
    print_every_x_episodes=100,  # print returns every 100 episodes
    learn_after_episode=False,   # False: do not defer Q-network updates to the end of each episode
    seed=0,
)

# Persist the episodic returns, then plot them against the episode index.
dqn_returns = info["return"]
torch.save(dqn_returns, "DQN-return.pt")

episode_index = np.arange(len(dqn_returns))
plt.plot(episode_index, dqn_returns, label="DQN")
plt.title("Episodic returns")
plt.xlabel("Episode")
plt.ylabel("Return")
plt.legend()
plt.show()

In [None]:
# Separate environment for season 2024: seasons 2003 through 2022 are
# excluded and shuffling is turned off. It is wrapped in Pearl's
# GymEnvironment adapter in the same expression.
test_env = GymEnvironment(
    TournamentEnv(
        season=2024,
        team_stats=team_data,
        discrete=True,
        shuffle=False,
        exclude_seasons=list(range(2003, 2023)),
        reward_on_round_end=False,
        loading_bar=False,
    )
)

In [None]:
# Greedy evaluation rollout of the trained DQN agent on `test_env`.
#
# - `test_env` is already constructed for season 2024, so it is reset without
#   arguments instead of calling `env.reset(season=2024)` on the training
#   environment (which was built with 2024 in `exclude_seasons`).
#   NOTE(review): this assumes GymEnvironment.reset() only accepts an optional
#   `seed`, so a `season=` keyword would be rejected — confirm against the
#   local Pearl checkout.
# - exploit=True asks the agent for its greedy action (no exploration).
# - agent.learn() is deliberately not called: the agent must not be updated
#   on the season it is being evaluated on.
agent = DQNagent
observation, action_space = test_env.reset()
agent.reset(observation, action_space)
done = False
while not done:
    action = agent.act(exploit=True)
    action_result = test_env.step(action)
    # observe() is kept: it is how the agent receives the result of the step.
    agent.observe(action_result)
    done = action_result.done

In [None]:
# Render the current state of `env`.
# NOTE(review): this renders `env`, not `test_env`; confirm that `env` is the
# environment the preceding rollout was meant to run on.
env.render()

In [None]:
# Q-value network dedicated to the Double DQN agent (a separate instance from
# the one used by the DQN agent). Actions are one-hot encoded, so
# action_dim = num_actions.
Q_network_DoubleDQN = VanillaQValueNetwork(
    output_dim=1,
    hidden_dims=hidden_dims,                   # widths of the intermediate layers
    action_dim=num_actions,                    # dimension of the action representation
    state_dim=env.observation_space.shape[0],  # dimension of the state representation
)

# Double DQN agent. `Q_network_DoubleDQN` is supplied through
# `network_instance` instead of selecting a network via `network_type`.
DoubleDQNagent = PearlAgent(
    policy_learner=DoubleDQN(
        network_instance=Q_network_DoubleDQN,
        action_representation_module=OneHotActionTensorRepresentationModule(
            max_number_actions=num_actions,
        ),
        state_dim=env.observation_space.shape[0],
        action_space=env.action_space,
        batch_size=64,
        training_rounds=10,
        soft_update_tau=0.75,
    ),
    replay_buffer=FIFOOffPolicyReplayBuffer(10_000),
)

In [None]:
# Pearl's online_learning helper runs the environment interaction and the
# learning loop, and returns a dictionary containing the episodic returns.
info_DoubleDQN = online_learning(
    agent=DoubleDQNagent,
    env=env,
    number_of_episodes=2000,
    print_every_x_episodes=100,  # print returns every 100 episodes
    learn_after_episode=True,    # update the Q-networks at the end of each episode rather than after every environment interaction
    seed=0,
)

# Persist the episodic returns, then plot them against the episode index.
double_dqn_returns = info_DoubleDQN["return"]
torch.save(double_dqn_returns, "DoubleDQN-return.pt")

episode_index = np.arange(len(double_dqn_returns))
plt.plot(episode_index, double_dqn_returns, label="DoubleDQN")
plt.title("Episodic returns")
plt.xlabel("Episode")
plt.ylabel("Return")
plt.legend()
plt.show()

In [None]:
# Greedy rollout of the trained Double DQN agent on `env`, for one episode.
#
# exploit=True asks the agent for its greedy action. agent.learn() is
# deliberately not called, so the Q-network is not modified while the policy
# is being evaluated and the episode reflects the agent as it was trained.
agent = DoubleDQNagent
observation, action_space = env.reset()
agent.reset(observation, action_space)
done = False
while not done:
    action = agent.act(exploit=True)
    action_result = env.step(action)
    # observe() is kept: it is how the agent receives the result of the step.
    agent.observe(action_result)
    done = action_result.done

In [None]:
# Render the current state of `env`, the environment stepped by the rollout
# in the preceding cell.
env.render()