In [None]:
import gym
import numpy as np

# Blackjack environment shared by every episode in this cell
env = gym.make('Blackjack-v1')

# Run configuration: number of episodes to simulate and the discount factor
# applied when accumulating returns (1.0 leaves returns undiscounted)
num_episodes = 100_000
gamma = 1.0

# Tabular estimates, both keyed by (state, action):
#   Q -> running mean of the returns observed for the pair
#   N -> how many returns have been averaged into Q for the pair
Q, N = {}, {}

# Function to choose an action based on epsilon-greedy strategy
def choose_action(state, epsilon=0.1):
    """Pick an action for ``state`` with an epsilon-greedy rule over ``Q``.

    ``Q`` is keyed by ``(state, action)`` pairs, so the estimates for a state
    are gathered by probing each action index; ``state in Q`` / ``Q[state]``
    can never match a key of that shape.

    Args:
        state: Hashable observation, used as the first element of the
            ``(state, action)`` keys in ``Q``.
        epsilon: Probability of taking an exploratory action drawn from
            ``env.action_space.sample()`` instead of the greedy one.

    Returns:
        Either a sample from ``env.action_space`` (when exploring, or when no
        action has an estimate for this state yet) or the integer index of the
        action with the highest estimate.
    """
    n_actions = env.action_space.n
    has_estimate = any((state, a) in Q for a in range(n_actions))

    # Explore when nothing is known about this state, or with probability epsilon.
    if not has_estimate or np.random.random() < epsilon:
        return env.action_space.sample()

    # Actions not tried yet in this state fall back to 0, the same starting
    # value the update loop assigns to a new (state, action) entry.
    action_values = [Q.get((state, a), 0) for a in range(n_actions)]
    return int(np.argmax(action_values))

# Monte Carlo algorithm: play full episodes, then average the observed returns
# into Q for every (state, action) occurrence in the episode
for episode in range(num_episodes):
    states, actions, rewards = [], [], []

    state = env.reset()
    done = False

    # Roll out one episode, recording what was seen, done and received per step
    while not done:
        action = choose_action(state)
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    # Walk the episode from the last step to the first so the return G can be
    # built up incrementally: G_t = gamma * G_{t+1} + r_t
    G = 0
    backwards = zip(reversed(states), reversed(actions), reversed(rewards))
    for state, action, step_reward in backwards:
        G = gamma * G + step_reward
        pair = (state, action)

        # Count this occurrence (pairs start from 0 the first time they appear)
        N[pair] = N.get(pair, 0) + 1

        # Incremental mean: shift the estimate toward G by 1/N of the gap
        estimate = Q.get(pair, 0)
        Q[pair] = estimate + (1 / N[pair]) * (G - estimate)

# Print the state-action values in the order the pairs were first recorded
for (state, action), value in Q.items():
    print(f"State: {state}, Action: {action}, Value: {value}")

# Close the environment
env.close()


State: (20, 10, False), Action: 0, Value: 0.44165907019143125
State: (13, 4, False), Action: 0, Value: -0.12254901960784312
State: (19, 10, False), Action: 1, Value: -0.8582089552238806
State: (14, 10, False), Action: 1, Value: -0.6970297029702981
State: (20, 1, False), Action: 0, Value: 0.19111969111969107
State: (16, 1, False), Action: 1, Value: -0.8381201044386419
State: (15, 1, False), Action: 1, Value: -0.8083989501312331
State: (12, 6, False), Action: 0, Value: -0.18867924528301896
State: (19, 6, True), Action: 1, Value: -0.3508771929824561
State: (8, 6, False), Action: 1, Value: -0.2972972972972971
State: (20, 8, False), Action: 1, Value: -0.9392857142857133
State: (21, 8, True), Action: 1, Value: -0.20960698689956325
State: (21, 6, False), Action: 0, Value: 0.9186602870813393
State: (15, 6, False), Action: 1, Value: -0.5954198473282447
State: (21, 6, True), Action: 1, Value: -0.2660550458715597
State: (12, 8, False), Action: 0, Value: -0.48557692307692335
State: (9, 8, False), 

In [None]:
import gym
import numpy as np

# FrozenLake environment used for every episode in this cell
env = gym.make('FrozenLake-v1')

# Episode budget and discount factor applied when accumulating returns
num_episodes = 10_000
gamma = 0.99

# Tables keyed by (state, action): Q is the running mean of observed returns,
# N is the number of returns averaged into it
Q, N = {}, {}

# Monte Carlo estimation: actions come straight from env.action_space.sample(),
# so Q describes that sampling policy rather than a learned one
for episode in range(num_episodes):
    states, actions, rewards = [], [], []

    state = env.reset()
    done = False

    # Roll out one episode until the environment reports it is done
    while not done:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    # Accumulate the discounted return from the final step backwards
    G = 0
    backwards = zip(reversed(states), reversed(actions), reversed(rewards))
    for state, action, step_reward in backwards:
        G = gamma * G + step_reward
        pair = (state, action)

        # Count this occurrence (pairs start from 0 the first time they appear)
        N[pair] = N.get(pair, 0) + 1

        # Incremental mean update toward the newly observed return
        estimate = Q.setdefault(pair, 0)
        Q[pair] = estimate + (1 / N[pair]) * (G - estimate)

# Print the learned state-action values in first-recorded order
for (state, action), value in Q.items():
    print(f"State: {state}, Action: {action}, Value: {value}")

# Close the environment
env.close()


State: 1, Action: 2, Value: 0.008042321718259608
State: 0, Action: 3, Value: 0.013109587399375178
State: 0, Action: 0, Value: 0.012548235929129693
State: 3, Action: 1, Value: 0.005987136468356283
State: 2, Action: 3, Value: 0.010572354132968847
State: 2, Action: 0, Value: 0.02183708321879
State: 1, Action: 3, Value: 0.011685856269280193
State: 0, Action: 1, Value: 0.011906478930320662
State: 4, Action: 1, Value: 0.014907840993790552
State: 0, Action: 2, Value: 0.01009991398312058
State: 1, Action: 0, Value: 0.00863851732950864
State: 8, Action: 1, Value: 0.03722576028986145
State: 4, Action: 0, Value: 0.015387590107003236
State: 4, Action: 3, Value: 0.010414073585582126
State: 4, Action: 2, Value: 0.013724280975637666
State: 1, Action: 1, Value: 0.010256090001256195
State: 8, Action: 2, Value: 0.026636567833353067
State: 8, Action: 0, Value: 0.014483908722205903
State: 6, Action: 2, Value: 0.04009662354275013
State: 6, Action: 1, Value: 0.03948671174369622
State: 6, Action: 3, Value: 0

In [None]:
import gym
import numpy as np

# FrozenLake environment used for every episode in this cell
env = gym.make('FrozenLake-v1')

# NOTE(review): epsilon is assigned but never read in this cell -- every action
# is drawn with env.action_space.sample(). It is kept because code outside this
# cell may rely on the name; confirm whether an epsilon-greedy policy was intended.
epsilon = 0.1
gamma = 0.99      # discount factor applied when accumulating returns
episodes = 10000  # number of episodes to simulate

# Tables keyed by (state, action): Q is the running mean of observed returns,
# N is the number of returns averaged into it
Q = {}
N = {}

for episode in range(episodes):
  states = []
  actions = []
  rewards = []
  state = env.reset()

  done = False

  # Roll out one episode, recording each step's state, action and reward
  while not done:
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    states.append(state)
    actions.append(action)
    rewards.append(reward)
    state = next_state

  # Walk the episode backwards so the discounted return G builds incrementally
  G = 0
  for t in range(len(states) - 1, -1, -1):
    G = gamma * G + rewards[t]

    state = states[t]
    action = actions[t]

    if (state, action) not in N:
      N[(state, action)] = 0
    N[(state, action)] += 1

    # Incremental mean: move the estimate toward G by 1/N of the gap
    if (state, action) not in Q:
      Q[(state, action)] = 0
    Q[(state, action)] += (G - Q[(state, action)]) / (N[(state, action)])


for state, action in Q.keys():
  print(f"State:{state} ; Action:{action} ; Value:{Q[(state,action)]}")

# Release the environment once the results are printed, as the other cells in
# this notebook do; previously this cell left it open.
env.close()






  deprecation(
  deprecation(


State:4 ; Action:2 ; Value:0.014507711570958275
State:0 ; Action:0 ; Value:0.012016500625617843
State:1 ; Action:0 ; Value:0.007739403245410946
State:0 ; Action:1 ; Value:0.01078948593959164
State:1 ; Action:3 ; Value:0.010545946629754263
State:0 ; Action:3 ; Value:0.012923602132687973
State:4 ; Action:0 ; Value:0.02261725096471672
State:1 ; Action:2 ; Value:0.010531846726736188
State:1 ; Action:1 ; Value:0.00922486546807176
State:2 ; Action:3 ; Value:0.011182780073846618
State:3 ; Action:0 ; Value:0.007118025610461714
State:3 ; Action:2 ; Value:0.0035060857335718156
State:3 ; Action:3 ; Value:0.007785036392969471
State:2 ; Action:1 ; Value:0.01377632456408947
State:2 ; Action:0 ; Value:0.019525597330225308
State:6 ; Action:3 ; Value:0.009457917497526037
State:2 ; Action:2 ; Value:0.024027643626709765
State:0 ; Action:2 ; Value:0.013740684518598483
State:6 ; Action:1 ; Value:0.052277460073409034
State:10 ; Action:3 ; Value:0.0336933773716209
State:6 ; Action:0 ; Value:0.053723354512643