<a href="https://colab.research.google.com/github/shazam-25/Simplify_ML/blob/main/MDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

# Simple MDP setup
states = [0, 1, 2]  # Set of states S
actions = [0, 1]    # Set of actions A
gamma = 0.9         # Discount factor

# Transition probabilities: P[s, a, s']
P = np.array([
    [[0.8, 0.2, 0.0], [0.0, 1.0, 0.0]],   # From state 0
    [[0.0, 0.9, 0.1], [0.0, 0.0, 1.0]],   # From state 1
    [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]],   # From state 2
])

# Rewards: R[s, a]
R = np.array([
    [5, 10],    # State 0
    [-1, 2],    # State 1
    [0, 0],     # State 2
])


def action_values(P, R, gamma, V, s):
    """Return the one-step lookahead value of every action in state ``s``.

    Computes Q(s, a) = sum_{s'} P[s, a, s'] * (R[s, a] + gamma * V[s'])
    for each action index ``a``.

    Args:
        P: Transition probabilities, indexed as P[s, a, s'].
        R: Immediate rewards, indexed as R[s, a].
        gamma: Discount factor.
        V: Current state-value estimates, indexed by state.
        s: Index of the state to evaluate.

    Returns:
        A list with one Q-value per action, in action-index order.
    """
    n_actions, n_states = P.shape[1], P.shape[2]
    return [
        sum(
            P[s, a, s_next] * (R[s, a] + gamma * V[s_next])
            for s_next in range(n_states)
        )
        for a in range(n_actions)
    ]


def value_iteration(P, R, gamma, num_iterations, verbose=False):
    """Run a fixed number of synchronous value-iteration sweeps from V = 0.

    Args:
        P: Transition probabilities, indexed as P[s, a, s'].
        R: Immediate rewards, indexed as R[s, a].
        gamma: Discount factor.
        num_iterations: Number of full sweeps over the state set.
        verbose: When true, print the value function after every sweep.

    Returns:
        The state-value estimates after ``num_iterations`` sweeps.
    """
    V = np.zeros(P.shape[0])
    for i in range(num_iterations):
        V_new = np.zeros_like(V)
        for s in range(len(V)):
            V_new[s] = max(action_values(P, R, gamma, V, s))
        # Rebind V only after the whole sweep. Doing it inside the state loop
        # makes V alias V_new, so later states would back up from an array
        # whose not-yet-visited entries are still zero instead of from the
        # previous sweep's estimates.
        V = V_new
        if verbose:
            print(f'Iteration {i+1}: Value function = {V}')
    return V


def greedy_policy(P, R, gamma, V):
    """Return the action that maximizes the one-step lookahead in each state.

    Ties are resolved in favor of the lowest action index (``np.argmax``
    returns the first maximum).

    Args:
        P: Transition probabilities, indexed as P[s, a, s'].
        R: Immediate rewards, indexed as R[s, a].
        gamma: Discount factor.
        V: State-value estimates to act greedily against.

    Returns:
        An integer array holding one action index per state.
    """
    return np.array(
        [np.argmax(action_values(P, R, gamma, V, s)) for s in range(len(V))],
        dtype=int,
    )


# Value Iteration
# NOTE: this runs a fixed number of sweeps rather than iterating to a
# tolerance, so V is the estimate after `num_iterations` sweeps and may still
# be changing between sweeps.
num_iterations = 10
V = value_iteration(P, R, gamma, num_iterations, verbose=True)

# Compute optimal policy
policy = greedy_policy(P, R, gamma, V)

print('\nOptimal value function:', V)
print('Optimal policy (action per state):', policy)

Iteration 1: Value function = [10.  2.  0.]
Iteration 2: Value function = [12.56  2.    0.  ]
Iteration 3: Value function = [14.4032  2.      0.    ]
Iteration 4: Value function = [15.730304  2.        0.      ]
Iteration 5: Value function = [16.68581888  2.          0.        ]
Iteration 6: Value function = [17.37378959  2.          0.        ]
Iteration 7: Value function = [17.86912851  2.          0.        ]
Iteration 8: Value function = [18.22577253  2.          0.        ]
Iteration 9: Value function = [18.48255622  2.          0.        ]
Iteration 10: Value function = [18.66744048  2.          0.        ]

Optimal value function: [18.66744048  2.          0.        ]
Optimal policy (action per state): [0 1 0]
