# Reinforcement Learning Mountain Car

## Env Setup

In [1]:
import gymnasium as gym
import numpy as np

In [2]:
env = gym.make("MountainCar-v0")
env.reset()

(array([-0.5102381,  0.       ], dtype=float32), {})

## Random Policy

In [3]:
episodeNumber = 10
timeSteps = 5000

for episodeIndex in range(episodeNumber):
    initial_state = env.reset()
    print("Episode: ", episodeIndex)
    env.render()
    for timeIndex in range(timeSteps):
        print("TimeStep:", timeIndex)
        random_action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(random_action)
        if terminated:
            break

env.close()

Episode:  0
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
TimeStep: 13
TimeStep: 14
TimeStep: 15
TimeStep: 16
TimeStep: 17
TimeStep: 18
TimeStep: 19
TimeStep: 20
TimeStep: 21
TimeStep: 22
TimeStep: 23
TimeStep: 24
TimeStep: 25
TimeStep: 26
TimeStep: 27
TimeStep: 28
TimeStep: 29
TimeStep: 30
TimeStep: 31
TimeStep: 32
TimeStep: 33
TimeStep: 34
TimeStep: 35
TimeStep: 36
TimeStep: 37
TimeStep: 38
TimeStep: 39
TimeStep: 40
TimeStep: 41
TimeStep: 42
TimeStep: 43
TimeStep: 44
TimeStep: 45
TimeStep: 46
TimeStep: 47
TimeStep: 48
TimeStep: 49
TimeStep: 50
TimeStep: 51
TimeStep: 52
TimeStep: 53
TimeStep: 54
TimeStep: 55
TimeStep: 56
TimeStep: 57
TimeStep: 58
TimeStep: 59
TimeStep: 60
TimeStep: 61
TimeStep: 62
TimeStep: 63
TimeStep: 64
TimeStep: 65
TimeStep: 66
TimeStep: 67
TimeStep: 68
TimeStep: 69
TimeStep: 70
TimeStep: 71
TimeStep: 72
TimeStep: 73
TimeStep: 74
TimeStep: 75
TimeStep: 

  gym.logger.warn(


TimeStep: 1045
TimeStep: 1046
TimeStep: 1047
TimeStep: 1048
TimeStep: 1049
TimeStep: 1050
TimeStep: 1051
TimeStep: 1052
TimeStep: 1053
TimeStep: 1054
TimeStep: 1055
TimeStep: 1056
TimeStep: 1057
TimeStep: 1058
TimeStep: 1059
TimeStep: 1060
TimeStep: 1061
TimeStep: 1062
TimeStep: 1063
TimeStep: 1064
TimeStep: 1065
TimeStep: 1066
TimeStep: 1067
TimeStep: 1068
TimeStep: 1069
TimeStep: 1070
TimeStep: 1071
TimeStep: 1072
TimeStep: 1073
TimeStep: 1074
TimeStep: 1075
TimeStep: 1076
TimeStep: 1077
TimeStep: 1078
TimeStep: 1079
TimeStep: 1080
TimeStep: 1081
TimeStep: 1082
TimeStep: 1083
TimeStep: 1084
TimeStep: 1085
TimeStep: 1086
TimeStep: 1087
TimeStep: 1088
TimeStep: 1089
TimeStep: 1090
TimeStep: 1091
TimeStep: 1092
TimeStep: 1093
TimeStep: 1094
TimeStep: 1095
TimeStep: 1096
TimeStep: 1097
TimeStep: 1098
TimeStep: 1099
TimeStep: 1100
TimeStep: 1101
TimeStep: 1102
TimeStep: 1103
TimeStep: 1104
TimeStep: 1105
TimeStep: 1106
TimeStep: 1107
TimeStep: 1108
TimeStep: 1109
TimeStep: 1110
TimeStep: 

## Q-Learning Setup

In [4]:
class QLearning:
    def __init__(self, env, alpha, gamma, epsilon, numberOfEpisodes, numberOfBins, lowerBounds, upperBounds):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon 
        self.numberOfActions = env.action_space.n 
        self.numberOfEpisodes = numberOfEpisodes
        self.numberOfBins = numberOfBins
        self.lowerBounds = lowerBounds
        self.upperBounds = upperBounds

        # rewards sum per episode
        self.sumEpisodeRewards=[]
        
        self.Qmatrix=np.random.uniform(low=0, high=1, size=(numberOfBins[0],numberOfBins[1],self.numberOfActions))

    # convert continuous values of the state into discrete values for Q-learning
    def getDiscreteState(self, state):
        position = state[0]
        velocity = state[1]

        # creating discrete bins for the continuous state space
        cartPositionBin = np.linspace(self.lowerBounds[0], self.upperBounds[0], self.numberOfBins[0])
        cartVelocityBin = np.linspace(self.lowerBounds[1], self.upperBounds[1], self.numberOfBins[1])

        # get indexes of the bins to which the continuous variables belong to
        indexPosition = np.maximum(np.digitize(position, cartPositionBin) - 1, 0)
        indexVelocity = np.maximum(np.digitize(velocity, cartVelocityBin) - 1, 0)

        return (indexPosition, indexVelocity)

    
    def selectAction(self, state, episodeNumber):

        # enabling random actions for exploration
        if episodeNumber < 500:
            return np.random.choice(self.numberOfActions)
        # eventually decreasing the value for epsilon to make the algorithm more greedy
        elif episodeNumber > 7000:
            self.epsilon = 0.999 * self.epsilon
        
        randomNumber = np.random.random()
        discreteState = self.getDiscreteState(state)

        if randomNumber < self.epsilon:
            return np.random.choice(self.numberOfActions)
        else:
            # select an action such that the Q-value for that action, state pair is the highest possible value in that state
            # np.max(self.Qmatrix[discreteState]))[0] -- will return a list of actions, as there could be multiple possible max actions
            return np.random.choice(np.where(self.Qmatrix[discreteState] == np.max(self.Qmatrix[discreteState]))[0])


    # simulating episodes with Q-learning
    def runEpisodes(self):
        for episodeIndex in range(self.numberOfEpisodes):

            state_S, _ = self.env.reset()
            state_S = list(state_S)

            episodeRewards = []

            print("Episode: ", episodeIndex)

            terminal_state = False
            for _ in range(500):
                state_S_discrete = self.getDiscreteState(state_S)
                action_a = self.selectAction(state_S, episodeIndex)

                state_S_prime, reward, terminal_state, _, _ = self.env.step(action_a)
                episodeRewards.append(reward)

                state_S_prime = list(state_S_prime)
                state_S_prime_discrete = self.getDiscreteState(state_S_prime)

                Q_max = np.max(self.Qmatrix[state_S_prime_discrete])

                if not terminal_state:
                    # update Q-values for non-terminal states
                    diff = reward + self.gamma*Q_max - self.Qmatrix[state_S_discrete + (action_a, )]
                    self.Qmatrix[state_S_discrete + (action_a, )] += self.alpha*diff
                else:
                    diff = reward - self.Qmatrix[state_S_discrete + (action_a, )]
                    self.Qmatrix[state_S_discrete + (action_a, )] += self.alpha*(reward - self.Qmatrix[state_S_discrete + (action_a, )])
                
                state_S = state_S_prime

                if terminal_state:
                    break
            
            
            print("Rewards: ", np.sum(episodeRewards))
            self.sumEpisodeRewards.append(np.sum(episodeRewards))

        
    # final optimal policy using the Q-matrix generated by running Q-learning
    def runOptimalPolicy(self):
        print("Running optimal policy")
        env1 = gym.make('MountainCar-v0', render_mode='human')
        curr_state, _ = env1.reset()
        env1.render()

        timeSteps = 1000

        for timeIndex in range(timeSteps):
            print("TimeStep:", timeIndex)
            curr_state_discrete = self.getDiscreteState(curr_state)
            # np.max(self.Qmatrix[discreteState]))[0] -- will return a list of actions, as there could be multiple possible max actions
            optimal_action = np.random.choice(np.where(self.Qmatrix[curr_state_discrete] == np.max(self.Qmatrix[curr_state_discrete]))[0])

            curr_state, reward, terminated, _, _ = env1.step(optimal_action)

            if (terminated):
                print(terminated)
                break

## Training agent and generating the optimal policy with Q-learning

In [5]:
env=gym.make('MountainCar-v0')
state, _ = env.reset()

print(state)

[-0.49505323  0.        ]


### Defining bounds and bins

In [6]:
upperBounds=env.observation_space.high 
lowerBounds=env.observation_space.low 

print(upperBounds)
print(lowerBounds)

[0.6  0.07]
[-1.2  -0.07]


In [7]:
num_bins = [30, 30]

### Defining parameters

In [8]:
alpha = 0.1
gamma = 1
epsilon = 0.2
numberOfEpisodes = 20000

### Running Q-Learning on the state specified above

In [9]:
Q = QLearning(env, alpha, gamma, epsilon, numberOfEpisodes, num_bins, lowerBounds, upperBounds)

In [10]:
Q.runEpisodes()

Episode:  0
Rewards:  -500.0
Episode:  1
Rewards:  -500.0
Episode:  2
Rewards:  -500.0
Episode:  3
Rewards:  -500.0
Episode:  4
Rewards:  -500.0
Episode:  5
Rewards:  -500.0
Episode:  6
Rewards:  -500.0
Episode:  7
Rewards:  -500.0
Episode:  8
Rewards:  -500.0
Episode:  9
Rewards:  -500.0
Episode:  10
Rewards:  -500.0
Episode:  11
Rewards:  -500.0
Episode:  12
Rewards:  -500.0
Episode:  13
Rewards:  -500.0
Episode:  14
Rewards:  -500.0
Episode:  15
Rewards:  -500.0
Episode:  16
Rewards:  -500.0
Episode:  17
Rewards:  -500.0
Episode:  18
Rewards:  -500.0
Episode:  19
Rewards:  -500.0
Episode:  20
Rewards:  -500.0
Episode:  21
Rewards:  -500.0
Episode:  22
Rewards:  -500.0
Episode:  23
Rewards:  -500.0
Episode:  24
Rewards:  -500.0
Episode:  25
Rewards:  -500.0
Episode:  26
Rewards:  -500.0
Episode:  27
Rewards:  -500.0
Episode:  28
Rewards:  -500.0
Episode:  29
Rewards:  -500.0
Episode:  30
Rewards:  -500.0
Episode:  31
Rewards:  -500.0
Episode:  32
Rewards:  -500.0
Episode:  33
Rewards

### Running the optimal policy

In [12]:
Q.runOptimalPolicy()

Running optimal policy
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
TimeStep: 13
TimeStep: 14
TimeStep: 15
TimeStep: 16
TimeStep: 17
TimeStep: 18
TimeStep: 19
TimeStep: 20
TimeStep: 21
TimeStep: 22
TimeStep: 23
TimeStep: 24
TimeStep: 25
TimeStep: 26
TimeStep: 27
TimeStep: 28
TimeStep: 29
TimeStep: 30
TimeStep: 31
TimeStep: 32
TimeStep: 33
TimeStep: 34
TimeStep: 35
TimeStep: 36
TimeStep: 37
TimeStep: 38
TimeStep: 39
TimeStep: 40
TimeStep: 41
TimeStep: 42
TimeStep: 43
TimeStep: 44
TimeStep: 45
TimeStep: 46
TimeStep: 47
TimeStep: 48
TimeStep: 49
TimeStep: 50
TimeStep: 51
TimeStep: 52
TimeStep: 53
TimeStep: 54
TimeStep: 55
TimeStep: 56
TimeStep: 57
TimeStep: 58
TimeStep: 59
TimeStep: 60
TimeStep: 61
TimeStep: 62
TimeStep: 63
TimeStep: 64
TimeStep: 65
TimeStep: 66
TimeStep: 67
TimeStep: 68
TimeStep: 69
TimeStep: 70
TimeStep: 71
TimeStep: 72
TimeStep: 73
TimeStep: 74
TimeStep: 75