<a href="https://colab.research.google.com/github/sahasubhajit/2048-game-in-c/blob/main/semi_gradient_sarsa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [None]:
import gymnasium as gym
import numpy as np
import math
import copy

# Identifier of the classic-control task used by every cell below.
ENV_ID = 'MountainCar-v0'
env = gym.make(ENV_ID)

In [None]:
# Bin edges used to discretise the two observation components:
# 18 evenly spaced edges for position, 28 for velocity.
pos_space = np.linspace(start=-1.2, stop=0.6, num=18)
vel_space = np.linspace(start=-0.07, stop=0.07, num=28)


def getState(observation):
    """Map a (position, velocity) observation to a pair of bin indices.

    Each component is located among its (increasing) bin edges; the index is
    the number of edges that are <= the value, so it ranges from 0 (below the
    first edge) to the number of edges (at or above the last edge).

    Returns a tuple ``(pos_bin, vel_bin)``.
    """
    position, velocity = observation
    # searchsorted(..., side='right') is what np.digitize computes for
    # monotonically increasing edges.
    return (
        np.searchsorted(pos_space, position, side='right'),
        np.searchsorted(vel_space, velocity, side='right'),
    )


# Demo: a position beyond the last edge and a velocity on the last edge both
# land in the top bin.
state = [0.9, 0.07]
getState(state)

(18, 28)

In [None]:
# Sanity-check rollout: take 1000 uniformly random actions, logging every
# transition and restarting the environment whenever an episode finishes.
step_template = 'State {}, Reward {}, Terminated {}, Truncated {}'
reset_template = " ############## Reset to the state ! ############# {}"

observation, info = env.reset()

for _ in range(1000):
    action = env.action_space.sample()  # random action; the observation is not consulted
    observation, reward, terminated, truncated, info = env.step(action)
    print(step_template.format(observation, reward, terminated, truncated))

    if not (terminated or truncated):
        continue

    # Episode finished: start a new one and report its first observation.
    observation, info = env.reset()
    print(reset_template.format(observation))

State [-0.47593728  0.00064857], Reward -1.0, Terminated False, Truncated False
State [-0.47464496  0.00129233], Reward -1.0, Terminated False, Truncated False
State [-0.47371846  0.00092649], Reward -1.0, Terminated False, Truncated False
State [-0.47316468  0.00055378], Reward -1.0, Terminated False, Truncated False
State [-4.7298771e-01  1.7696962e-04], Reward -1.0, Terminated False, Truncated False
State [-0.47418886 -0.00120116], Reward -1.0, Terminated False, Truncated False
State [-0.47575924 -0.00157038], Reward -1.0, Terminated False, Truncated False
State [-0.47768718 -0.00192794], Reward -1.0, Terminated False, Truncated False
State [-0.47895837 -0.00127119], Reward -1.0, Terminated False, Truncated False
State [-0.4815634  -0.00260499], Reward -1.0, Terminated False, Truncated False
State [-0.4844828  -0.00291942], Reward -1.0, Terminated False, Truncated False
State [-0.48769492 -0.00321212], Reward -1.0, Terminated False, Truncated False
State [-0.4901758  -0.00248088], R

In [None]:
class sg_sarsa:
    """Episodic semi-gradient SARSA with a linear action-value function.

    Every action ``a`` owns one row of ``self.weight`` and its value is
    ``q(s, a) = weight[a] . basis_value(s)``, where ``basis_value(s)`` is the
    pair of bin indices of the (position, velocity) observation.

    Actions handled by this class are 1-based (``1 .. action_space.n``);
    ``run`` subtracts one before calling ``env.step``.

    Parameters
    ----------
    environment :
        Gymnasium-style environment providing ``reset()``, ``step(action)``,
        ``action_space.n`` and ``observation_space.shape``.
    learning_rate :
        Step size of the weight update. The features are raw bin indices (up
        to 18 and 28), so the default of 1 overshoots badly; callers should
        normally pass a much smaller value (e.g. ``1e-3`` or below).
    discount_factor :
        Discount applied to the bootstrapped next action value.
    episodes :
        Number of episodes ``run`` executes.
    epsilon :
        Initial probability of picking a uniformly random action. It is
        lowered after every episode and never drops below 0.1.
    track_reward :
        When true, the total reward of each finished episode is appended to
        ``self.reward_history``.
    visualize_qvalue :
        Stored on the instance; not used by this class otherwise.
    max_steps :
        Number of steps after which an episode is treated as truncated.
    """

    def __init__(self, environment, learning_rate = 1, discount_factor = 0.9, episodes = 1, \
                 epsilon = 0.9999, track_reward = True, visualize_qvalue = True, max_steps = 100):
        self.env = environment
        self.initial_state, info = self.env.reset()
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.gamma = discount_factor
        self.max_episode = episodes
        self.max_steps = max_steps
        self.track_reward = track_reward
        self.visualize_qvalue = visualize_qvalue
        self.reward_history = []

        n_actions = self.env.action_space.n
        n_features = self.env.observation_space.shape[0]
        self.initial_weight = np.zeros((n_actions, n_features))
        # Learn on a copy so that initial_weight really keeps the starting values.
        self.weight = self.initial_weight.copy()

        self.pos_space = np.linspace(-1.2, 0.6, 18)
        self.vel_space = np.linspace(-0.07, 0.07, 28)

    def basis_value(self, state):
        """Return the feature vector ``[pos_bin, vel_bin]`` of an observation."""
        pos, vel = state
        pos_bin = np.digitize(pos, self.pos_space)
        vel_bin = np.digitize(vel, self.vel_space)

        return np.array([pos_bin, vel_bin])

    def _action_values(self, state):
        """Return ``q(state, a)`` for every action as a 1-D array."""
        return self.weight @ self.basis_value(state)

    def draw_action(self, state):
        """Pick a 1-based action epsilon-greedily with respect to the current weights."""
        if np.random.uniform() < self.epsilon:
            return np.random.randint(1, self.env.action_space.n + 1)
        return int(np.argmax(self._action_values(state))) + 1

    def run(self, verbose = False):
        """Train for ``self.max_episode`` episodes, updating ``self.weight`` in place.

        The update applied after every environment step is

            w[a] += learning_rate * (target - q(S, A)) * basis_value(S)

        with ``target = R`` when the environment reports a true terminal state
        and ``target = R + gamma * q(S', A')`` otherwise (including episodes
        that are merely cut off by the step limit).

        Parameters
        ----------
        verbose :
            When true, print the weights and TD quantities at every step.
            A one-line summary is printed at the end of every episode
            regardless.
        """
        weight = self.weight
        # Start from a fresh reset so the stored state always matches the
        # environment, even if it was stepped since __init__ or by a previous run.
        state, info = self.env.reset()
        self.initial_state = state
        action = self.draw_action(state)

        current_episode = 0
        episode_reward = 0
        step = 0
        while current_episode < self.max_episode:
            next_state, reward, terminated, truncated, info = self.env.step(action - 1)
            episode_reward += reward
            step += 1
            # Keep a truncation signalled by the environment and add our own step cap.
            truncated = bool(truncated) or step >= self.max_steps

            features = self.basis_value(state)
            q_current = np.dot(features, weight[action - 1])
            if terminated:
                # True terminal state: nothing to bootstrap from.
                next_action = None
                target = reward
            else:
                # The next action is drawn before the update so that the target
                # is computed from the pre-update weights.
                next_action = self.draw_action(next_state)
                q_next = np.dot(self.basis_value(next_state), weight[next_action - 1])
                target = reward + self.gamma * q_next
                if verbose:
                    print('Previous weight', weight)
                    print("Reward ##### Next Action #### Next State #### Q(S*, A*) #### Q(S, A) #### S")
                    print(reward, next_action, self.basis_value(next_state), self.gamma * q_next, q_current, features)

            # Semi-gradient step: for a linear q the gradient w.r.t. w[a] is
            # the feature vector of the *current* state.
            weight[action - 1] += self.learning_rate * (target - q_current) * features
            if verbose:
                print('present weight', weight)

            if terminated or truncated:
                state, info = self.env.reset()
                action = self.draw_action(state)
                print(" ########################### Episode {} has ended with total reward {} for terminated = {} truncated = {} Epsilon = {} ######################## ".format(current_episode, episode_reward, terminated, truncated, self.epsilon))
                if current_episode%100 == 0:
                    print(weight)
                if self.track_reward:
                    self.reward_history.append(episode_reward)
                current_episode += 1
                episode_reward = 0
                step = 0
                self.epsilon = max(self.epsilon - current_episode/(10*self.max_episode), 0.1)
            else:
                state = next_state
                action = next_action

In [None]:
# The class default learning_rate=1 makes the weights explode within a few
# updates (the features are raw bin indices as large as 18 and 28), so train
# with a small explicit step size instead.
SARSA = sg_sarsa(env, learning_rate=5e-4)
SARSA.run()

Previous weight [[0. 0.]
 [0. 0.]
 [0. 0.]]
Reward ##### Next Action #### Next State #### Q(S*, A*) #### Q(S, A) #### S
-1.0 1 [ 6 14] 0.0 0.0 [ 6 14]
present weight [[ -6. -14.]
 [  0.   0.]
 [  0.   0.]]
Previous weight [[ -6. -14.]
 [  0.   0.]
 [  0.   0.]]
Reward ##### Next Action #### Next State #### Q(S*, A*) #### Q(S, A) #### S
-1.0 2 [ 6 14] 0.0 -232.0 [ 6 14]
present weight [[1380. 3220.]
 [   0.    0.]
 [   0.    0.]]
Previous weight [[1380. 3220.]
 [   0.    0.]
 [   0.    0.]]
Reward ##### Next Action #### Next State #### Q(S*, A*) #### Q(S, A) #### S
-1.0 1 [ 6 14] 48024.0 0.0 [ 6 14]
present weight [[  1380.   3220.]
 [288138. 672322.]
 [     0.      0.]]
Previous weight [[  1380.   3220.]
 [288138. 672322.]
 [     0.      0.]]
Reward ##### Next Action #### Next State #### Q(S*, A*) #### Q(S, A) #### S
-1.0 3 [ 6 14] 0.0 53360.0 [ 6 14]
present weight [[-318786. -743834.]
 [ 288138.  672322.]
 [      0.       0.]]
Previous weight [[-318786. -743834.]
 [ 288138.  672322.]

In [None]:
# Display the agent's weight matrix: one row per action, one column per
# observation dimension.
SARSA.weight

array([[1.86607717e-01, 1.56172973e-04],
       [1.69446301e-01, 2.51024367e-04],
       [1.97971159e-01, 1.11639122e-04]])

In [None]:
# List every attribute name available on the environment object.
dir(env)

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_action_space',
 '_cached_spec',
 '_elapsed_steps',
 '_is_protocol',
 '_max_episode_steps',
 '_metadata',
 '_np_random',
 '_observation_space',
 '_reward_range',
 '_saved_kwargs',
 'action_space',
 'class_name',
 'close',
 'env',
 'get_wrapper_attr',
 'metadata',
 'np_random',
 'observation_space',
 'render',
 'render_mode',
 'reset',
 'reward_range',
 'spec',
 'step',
 'unwrapped',
 'wrapper_spec']