In [1]:
class MDP:
    """A finite Markov decision process stored as plain dictionaries.

    The object is a thin container: it keeps the references it is given
    (nothing is copied or validated) and offers lookup helpers on top of
    the ``transitions`` and ``rewards`` tables.
    """

    def __init__(self, states, actions, transitions, rewards, gamma=0.9):
        """
        Initialize the MDP.
        :param states: List of all states in the MDP.
        :param actions: List of all actions in the MDP.
        :param transitions: A dictionary where keys are tuples of (state, action) and values are lists of tuples (next_state, probability).
        :param rewards: A dictionary where keys are tuples of (state, action, next_state) and values are the rewards for taking the action in the state and ending up in next_state.
        :param gamma: Discount factor (default is 0.9).
        """
        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.gamma = gamma

    def get_possible_actions(self, state):
        """
        Return a list of possible actions in a given state.
        :param state: The state to look up.
        :return: Every action that appears together with ``state`` as a key of ``transitions``, in the dictionary's iteration order; an empty list when there is none.
        """
        # Iterating a dict yields its keys, so no explicit .keys() is needed.
        return [action for (s, action) in self.transitions if s == state]

    def get_next_states_and_probs(self, state, action):
        """
        Return a list of next states and their transition probabilities given a state and action.
        :param state: The current state.
        :param action: The action taken in ``state``.
        :return: The list of (next_state, probability) tuples stored for the pair (the stored list itself, not a copy), or an empty list when the pair is not in ``transitions``.
        """
        return self.transitions.get((state, action), [])

    def get_reward(self, state, action, next_state):
        """
        Return the reward for a transition from state to next_state using action.
        :param state: The state the transition starts from.
        :param action: The action taken in ``state``.
        :param next_state: The state the transition ends in.
        :return: The stored reward, or 0 when the triple is not in ``rewards``.
        """
        return self.rewards.get((state, action, next_state), 0)

    def is_terminal(self, state):
        """
        Check if a state is terminal (i.e., has no actions available).
        :param state: The state to test.
        :return: True when ``get_possible_actions(state)`` is empty, False otherwise.
        """
        # An empty list is falsy; delegating keeps subclass overrides of
        # get_possible_actions in effect here as well.
        return not self.get_possible_actions(state)


# Demo: a small three-state MDP; 's3' never appears as a source state below.
states = ['s1', 's2', 's3']
actions = ['a1', 'a2']

# (state, action) -> [(next_state, probability), ...]
transitions = {
    ('s1', 'a1'): [('s2', 0.8), ('s3', 0.2)],
    ('s1', 'a2'): [('s3', 1.0)],
    ('s2', 'a1'): [('s1', 1.0)],
    ('s2', 'a2'): [('s3', 1.0)],
}

# (state, action, next_state) -> reward
rewards = {
    ('s1', 'a1', 's2'): 5,
    ('s1', 'a1', 's3'): 10,
    ('s1', 'a2', 's3'): 2,
    ('s2', 'a1', 's1'): -1,
    ('s2', 'a2', 's3'): 0,
}

# The state/action pair that is queried and printed below.
state, action = 's1', 'a1'

mdp = MDP(
    states=states,
    actions=actions,
    transitions=transitions,
    rewards=rewards,
)

next_states_and_probs = mdp.get_next_states_and_probs(state=state, action=action)
reward = mdp.get_reward(state=state, action=action, next_state='s2')

# Show the outcome distribution, then the reward for landing in 's2'.
print(f"Next states and probabilities from state {state} taking action {action}: {next_states_and_probs}")
print(f"Reward for moving from {state} to 's2' using {action}: {reward}")


Next states and probabilities from state s1 taking action a1: [('s2', 0.8), ('s3', 0.2)]
Reward for moving from s1 to 's2' using a1: 5
